-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTF_IDF.cpp
More file actions
79 lines (68 loc) · 2.77 KB
/
TF_IDF.cpp
File metadata and controls
79 lines (68 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
//This is the implementtion of TF_IDF algorithm for the document ranking
#include <iostream>
#include <vector>
#include <map>
#include <cmath>
#include <string>
#include <sstream>
#include <algorithm>
#include <unordered_map>
#include <set>
#include "invertedIndex.cpp"
using namespace std;
void TF_IDF()
{
unordered_map<string, unordered_map<int, int>> tokenANDInvertedIndex = index();
set<string> allCollectedTokens;
for (auto it : tokenANDInvertedIndex) allCollectedTokens.insert(it.first); // create a vocabulary that includes all unique words from all documents.
int N = tokenANDInvertedIndex.size(); // Total number of documents
// Step 1: Calculate Term Frequency (TF)
unordered_map<int, unordered_map<string, double>> termFrequency;
for (auto& [term, docMap] : tokenANDInvertedIndex) {
for (auto& [docID, count] : docMap) {
termFrequency[docID][term] = (double)count / docMap.size();
}
}
// Step 2: Compute Inverse Document Frequency (IDF)
unordered_map<string, double> inverseDocumentFrequency;
for (auto& term : allCollectedTokens) {
int df = tokenANDInvertedIndex[term].size();
inverseDocumentFrequency[term] = log((double)N / (1 + df));
}
// Step 3: Compute TF-IDF
unordered_map<int, unordered_map<string, double>> tfidf;
for (auto& [docID, tfMap] : termFrequency) {
for (auto& [term, tf] : tfMap) {
tfidf[docID][term] = tf * inverseDocumentFrequency[term];
}
}
// Step 4: Document Ranking (Cosine Similarity)
// Assuming we have a query vector (queryTFIDF) to compare with
unordered_map<string, double> queryTFIDF; // This should be computed similarly to tfidf for the query
// Function to compute cosine similarity
auto cosineSimilarity = [](unordered_map<string, double>& vec1, unordered_map<string, double>& vec2) {
double dotProduct = 0.0, normA = 0.0, normB = 0.0;
for (auto& [term, val] : vec1) {
dotProduct += val * vec2[term];
normA += val * val;
}
for (auto& [term, val] : vec2) {
normB += val * val;
}
return dotProduct / (sqrt(normA) * sqrt(normB));
};
// Rank documents based on cosine similarity with the query
vector<pair<int, double>> docScores;
for (auto& [docID, docTFIDF] : tfidf) {
double score = cosineSimilarity(docTFIDF, queryTFIDF);
docScores.push_back({docID, score});
}
// Sort documents by score in descending order
sort(docScores.begin(), docScores.end(), [](pair<int, double>& a, pair<int, double>& b) {
return a.second > b.second;
});
// Output ranked documents
for (auto& [docID, score] : docScores) {
cout << "Document ID: " << docID << " Score: " << score << endl;
}
}