import math
# Per-term counts for the exercise; the scoring code looks them up as a[term].
a = {"california": 95, "business": 601, "tax": 354, "return": 607}

# Corpus size that the per-term counts are compared against (a[term] / n).
n = 250000

# Average length; the scoring code divides it by the length of each query.
l_avg = 4
def calc_score(new_query, given_query, *, term_counts=None, corpus_size=None,
               avg_len=None):
    """Print and return the DFR-style score of ``given_query`` against ``new_query``.

    For every term ``t`` in ``given_query`` the contribution is::

        (log2(1 + c_t / N) + f' * log2(1 + N / c_t)) / (f' + 1)

    where ``c_t`` is the term's entry in ``term_counts``, ``N`` is
    ``corpus_size`` and ``f'`` is the number of times ``t`` occurs in
    ``new_query`` multiplied by ``log2(1 + avg_len / len(new_query))``.

    Args:
        new_query: Sequence of terms being scored (plays the role of the
            document); must support ``count`` and ``len``.
        given_query: Iterable of terms whose contributions are summed.
        term_counts: Mapping of term -> count. Defaults to the module-level
            ``a`` table.
        corpus_size: Value of ``N``. Defaults to the module-level ``n``.
        avg_len: Average length. Defaults to the module-level ``l_avg``.

    Returns:
        The accumulated score (``0`` when ``given_query`` is empty). The same
        value is also printed, as the original script did.

    Raises:
        KeyError: If a term of ``given_query`` is missing from ``term_counts``.
    """
    # Resolve the defaults lazily so the module-level tables are only needed
    # when the caller does not supply its own values.
    if term_counts is None:
        term_counts = a
    if corpus_size is None:
        corpus_size = n
    if avg_len is None:
        avg_len = l_avg

    # The length normalisation does not depend on the term, so compute it once.
    # An empty new_query contains no terms, so every frequency is 0 and the
    # factor never matters; use 0.0 there instead of dividing by zero.
    if new_query:
        length_factor = math.log(1 + avg_len / len(new_query), 2)
    else:
        length_factor = 0.0

    score = 0
    for term in given_query:
        ftd = new_query.count(term)
        ftd_ = ftd * length_factor
        count = term_counts[term]
        num = (math.log(1 + count / corpus_size, 2)
               + ftd_ * math.log(1 + corpus_size / count, 2))
        den = ftd_ + 1
        score += num / den
    print(score)
    return score
# Score the same two-term query against a three-term and a four-term query.
given_query = ["california", "business"]
for new_query in (
    ["california", "business", "tax"],
    ["california", "business", "tax", "return"],
):
    calc_score(new_query, given_query)
Output:
["california", "business", "tax"] ----> 11.038836781376123
["california", "business", "tax", "return"] ----> 10.035042371506819
(Edited: 2022-04-27)
package CS267;
/**
 * In-class exercise: scores the first two terms (california, business)
 * against two documents of length 3 and 4 using
 *
 *   sum over terms of (log2(1 + l_t/n) + f' * log2(1 + n/l_t)) / (f' + 1),
 *   with f' = f_t_d * log2(1 + l_avg/l_d),
 *
 * and prints one score per document.
 */
public class InClassExercise_27Apr {

    /** Number of leading array entries that are scored (california, business). */
    private static final int QUERY_TERMS = 2;

    public static void main(String[] args)
    {
        double n = 250000;
        // Per-term counts, in the order california, business, tax, return.
        double[] l_t = {95, 601, 354, 607};
        // Every term is taken to occur exactly once in each document.
        double[] f_t_d = {1, 1, 1, 1};
        double l_avg = 4;

        // California Business Tax (document length 3)
        System.out.println("California Business Tax " + dfrScore(l_t, f_t_d, QUERY_TERMS, n, l_avg, 3));

        // California Business Tax Return (document length 4)
        System.out.println("California Business Tax Return " + dfrScore(l_t, f_t_d, QUERY_TERMS, n, l_avg, 4));
    }

    /** Returns the base-2 logarithm of x. */
    private static double log2(double x)
    {
        return Math.log(x) / Math.log(2);
    }

    /**
     * Sums the per-term score of the first queryTerms entries for a document
     * of length l_d.
     *
     * @param l_t        per-term counts
     * @param f_t_d      per-term frequencies within the document
     * @param queryTerms how many leading entries of the arrays to score
     * @param n          corpus size
     * @param l_avg      average length
     * @param l_d        length of the document being scored
     * @return the accumulated score
     */
    private static double dfrScore(double[] l_t, double[] f_t_d, int queryTerms,
                                   double n, double l_avg, double l_d)
    {
        // The length normalisation is identical for every term of one document.
        double lengthFactor = log2(1 + (l_avg / l_d));
        double score = 0;
        for (int i = 0; i < queryTerms; i++)
        {
            double fprime = f_t_d[i] * lengthFactor;
            double numerator = log2(1 + (l_t[i] / n)) + fprime * log2(1 + (n / l_t[i]));
            double denominator = fprime + 1;
            score = score + numerator / denominator;
        }
        return score;
    }
}
Given: size(corpus) N = 250000, lavg(query) = 4, n(california) = 95, n(business) = 601, n(tax) = 354, n(return) = 607. Below, l_t = n(t) and every log is base 2.

DFR = $\sum_{t} \dfrac{\log_2(1 + l_t/N) + f'_{t,d} \cdot \log_2(1 + N/l_t)}{f'_{t,d} + 1}$, where $f'_{t,d} = f_{t,d} \cdot \log_2(1 + l_{avg}/l_d)$.

1. california business tax
   l_d = 3
   f_{t,d} (for each term) = 1
   f'_{t,d} (for each term) = f_{t,d} * log(1 + l_avg/l_d) = 1 * log(1 + 4/3) = log(7/3) = 1.22
   DFR(california) = (log(1 + 95/250000) + 1.22 * log(1 + 250000/95)) / 2.22
                   = (log(1.00038) + 1.22 * log(2632.58)) / 2.22
                   = (0.00055 + 1.22 * 11.36) / 2.22 = 6.243
   DFR(business) = (log(1 + 601/250000) + 1.22 * log(1 + 250000/601)) / 2.22
                 = (log(1.0024) + 1.22 * log(416.97)) / 2.22
                 = (0.0034 + 1.22 * 8.70) / 2.22 = 4.782
   DFR(total_1) = 6.243 + 4.782 = 11.025

2. california business tax return
   l_d = 4
   f_{t,d} (for each term) = 1
   f'_{t,d} (for each term) = f_{t,d} * log(1 + l_avg/l_d) = 1 * log(1 + 4/4) = log(2) = 1
   DFR(california) = (log(1 + 95/250000) + 1 * log(1 + 250000/95)) / 2
                   = (log(1.00038) + log(2632.58)) / 2
                   = (0.00055 + 11.36) / 2 = 5.68
   DFR(business) = (log(1 + 601/250000) + 1 * log(1 + 250000/601)) / 2
                 = (log(1.0024) + log(416.97)) / 2
                 = (0.0034 + 8.70) / 2 = 4.352
   DFR(total_2) = 5.68 + 4.352 = 10.032
def score(query, ld, *, occurrences=None, corpus_size=None, avg_len=None):
    """Return the DFR-style score of ``query`` for a document of length ``ld``.

    Each term ``t`` of ``query`` contributes::

        (log2(1 + c_t / N) + f' * log2(1 + N / c_t)) / (f' + 1)

    with ``c_t = occurrences[t]``, ``N = corpus_size`` and
    ``f' = ftd * log2(1 + avg_len / ld)``. ``ftd`` is fixed at 1, i.e. every
    query term is treated as occurring exactly once in the document.

    Args:
        query: Iterable of terms to score.
        ld: Length of the document; must be non-zero.
        occurrences: Mapping of term -> count. Defaults to the module-level
            ``occ`` table.
        corpus_size: Value of ``N``. Defaults to the module-level ``N``.
        avg_len: Average length. Defaults to the module-level ``l_avg``.

    Returns:
        The summed score (``0`` for an empty ``query``).

    Raises:
        KeyError: If a term of ``query`` is missing from ``occurrences``.
        ZeroDivisionError: If ``ld`` is 0.
    """
    # Resolve the defaults lazily so the module-level values, which are
    # assigned further down the script, are only read at call time.
    if occurrences is None:
        occurrences = occ
    if corpus_size is None:
        corpus_size = N
    if avg_len is None:
        avg_len = l_avg

    total = 0
    ftd = 1
    ftd_prime = ftd * math.log(1 + avg_len / ld, 2)
    for term in query:
        count = occurrences[term]
        total += (math.log2(1 + count / corpus_size)
                  + ftd_prime * math.log2(1 + corpus_size / count)) \
            / (ftd_prime + 1)
    return total
# Exercise inputs: corpus size, average length and per-term counts.
occ = {
    "california": 95,
    "business": 601,
    "tax": 354,
    "return": 607,
}
N, l_avg = 250000, 4

# The two-term query is scored against a three-term and a four-term query;
# only the length of the longer query is passed to score().
given = ["california", "business"]
query1 = ["california", "business", "tax"]
query2 = ["california", "business", "tax", "return"]

print(query1, score(given, len(query1)))
print(query2, score(given, len(query2)))