import math

# Per-term statistic given in the exercise, keyed by term.
a = {
    "california": 95,
    "business": 601,
    "tax": 354,
    "return": 607,
}
n = 250000  # corpus size used in the formula
l_avg = 4  # average length used for length normalisation


def calc_score(new_query, given_query):
    """Print and return the DFR score of ``new_query`` for ``given_query``.

    ``new_query`` plays the role of the document being scored and
    ``given_query`` supplies the terms that are summed over.  For each term
    ``t`` of ``given_query`` the contribution is::

        (log2(1 + a[t] / n) + f' * log2(1 + n / a[t])) / (f' + 1)

    where ``f' = count(t in new_query) * log2(1 + l_avg / len(new_query))``.
    Reads the module-level ``a``, ``n`` and ``l_avg``.

    Args:
        new_query: list of terms forming the document to score.
        given_query: iterable of terms; each must be a key of ``a``.

    Returns:
        The accumulated score (``0`` when ``given_query`` is empty).  The
        same value is printed, as the script relies on that output.

    Raises:
        KeyError: if a term of ``given_query`` is not in ``a``.
    """
    doc_len = len(new_query)
    # An empty document contains no term, so every adjusted frequency is 0;
    # skipping the normalisation avoids dividing by a zero length.
    length_norm = math.log(1 + l_avg / doc_len, 2) if doc_len else 0.0

    score = 0
    for term in given_query:
        ftd = new_query.count(term)
        ftd_ = ftd * length_norm
        num = math.log((1 + a[term] / n), 2) + ftd_ * math.log((1 + n / a[term]), 2)
        den = ftd_ + 1
        score += num / den
    print(score)
    return score


new_query = ["california", "business", "tax"]
given_query = ["california", "business"]
calc_score(new_query, given_query)

new_query = ["california", "business", "tax", "return"]
calc_score(new_query, given_query)

# Output posted with the script:
# ["california", "business", "tax"] ----> 11.038836781376123
# ["california", "business", "tax", "return"] ----> 10.035042371506819
# (Edited: 2022-04-27)
package CS267; public class InClassExercise_27Apr { public static void main(String args[]) { double n = 250000; double l_t[] = new double[]{95, 601, 354, 607}; double f_t_d[] = new double[]{1,1,1,1}; double fprime_t_d[] = new double[4]; double l_avg =4; // California Business Tax double score = 0; double l_d = 3; for(int i=0;i<4;i++) { fprime_t_d[i] = f_t_d[i] * (Math.log(1+ (l_avg/l_d))/Math.log(2)); } for(int i =0;i<2;i++) { double numerator = (Math.log(1+ (l_t[i]/n))/Math.log(2)) + fprime_t_d[i] * (Math.log(1+ (n/l_t[i]))/Math.log(2)); double denominator = fprime_t_d[i] + 1; score = score + numerator/denominator; } System.out.println("California Business Tax "+score); // California Business Tax Return score = 0; l_d = 4; for(int i=0;i<4;i++) { fprime_t_d[i] = f_t_d[i] * (Math.log(1+ (l_avg/l_d))/Math.log(2)); } for(int i =0;i<2;i++) { double numerator = (Math.log(1+ (l_t[i]/n))/Math.log(2)) + fprime_t_d[i] * (Math.log(1+ (n/l_t[i]))/Math.log(2)); double denominator = fprime_t_d[i] + 1; score = score + numerator/denominator; } System.out.println("California Business Tax Return "+score); } }
Given: size(corpus) N = 250000; lavg(query) = 4; n(california) = 95, n(business) = 601, n(tax) = 354, n(return) = 607. All logarithms are base 2, and l_t below is the n(t) value of term t.

$\mathrm{DFR} = \sum_{t \in q} \dfrac{\log_2(1 + l_t/N) + f'_{t,d}\,\log_2(1 + N/l_t)}{f'_{t,d} + 1}$, where $f'_{t,d} = f_{t,d}\,\log_2(1 + l_{avg}/l_d)$ and the sum runs over the query terms (california, business).

1. california business tax
l_d = 3; f_{t,d} (for each term) = 1
f'_{t,d} (for each term) = f_{t,d} * log2(1 + l_avg/l_d) = 1 * log2(1 + 4/3) = log2(7/3) = 1.22
DFR(california) = (log2(1 + 95/250000) + 1.22 * log2(1 + 250000/95)) / 2.22 = (log2(1.00038) + 1.22 * log2(2632.58)) / 2.22 = (0.00055 + 1.22 * 11.36) / 2.22 = 6.243
DFR(business) = (log2(1 + 601/250000) + 1.22 * log2(1 + 250000/601)) / 2.22 = (log2(1.0024) + 1.22 * log2(416.97)) / 2.22 = (0.0034 + 1.22 * 8.70) / 2.22 = 4.782
DFR(total_1) = 6.243 + 4.782 = 11.025

2. california business tax return
l_d = 4; f_{t,d} (for each term) = 1
f'_{t,d} (for each term) = f_{t,d} * log2(1 + l_avg/l_d) = 1 * log2(1 + 4/4) = log2(2) = 1
DFR(california) = (log2(1 + 95/250000) + 1 * log2(1 + 250000/95)) / 2 = (log2(1.00038) + log2(2632.58)) / 2 = (0.00055 + 11.36) / 2 = 5.68
DFR(business) = (log2(1 + 601/250000) + 1 * log2(1 + 250000/601)) / 2 = (log2(1.0024) + log2(416.97)) / 2 = (0.0034 + 8.70) / 2 = 4.352
DFR(total_2) = 5.68 + 4.352 = 10.032
def score(query, ld):
    """Return the DFR score of a document of length ``ld`` for ``query``.

    Every query term is taken to occur exactly once in the document
    (``ftd = 1``), so the document itself is not needed, only its length.
    Reads the module-level ``N``, ``l_avg`` and ``occ``.

    Args:
        query: iterable of terms; each must be a key of ``occ``.
        ld: length of the document being scored; must be non-zero.

    Returns:
        The sum over the query terms of
        ``(log2(1 + occ[t] / N) + f' * log2(1 + N / occ[t])) / (f' + 1)``
        with ``f' = ftd * log2(1 + l_avg / ld)``; ``0`` for an empty query.

    Raises:
        KeyError: if a term of ``query`` is not in ``occ``.
        ZeroDivisionError: if ``ld`` is 0.
    """
    ftd = 1
    # Length-normalised term frequency; identical for every term because
    # ftd is fixed, so it is computed once outside the sum.
    ftd_prime = ftd * math.log2(1 + l_avg / ld)
    return sum(
        (math.log2(1 + occ[term] / N) + ftd_prime * math.log2(1 + N / occ[term]))
        / (ftd_prime + 1)
        for term in query
    )
# Exercise inputs read by score().
N = 250000  # corpus size
l_avg = 4  # average length used for length normalisation

# Per-term statistic used in the formula, keyed by term.
occ = {
    "california": 95,
    "business": 601,
    "tax": 354,
    "return": 607,
}
# Terms that are scored, followed by the two documents they are scored against.
given = ["california", "business"]
query1 = ["california", "business", "tax"]
query2 = ["california", "business", "tax", "return"]

# Print each document next to its score; score() only needs the document's
# length, so that is all that is passed along with the terms.
for doc in (query1, query2):
    print(doc, score(given, len(doc)))