From e1e6a3eca2824eccd4ab68daf742ccdd6391e255 Mon Sep 17 00:00:00 2001
From: Sophia <ss5837@drexel.edu>
Date: Sun, 2 Mar 2025 09:25:04 -0500
Subject: [PATCH] fix: update embedding cleaning

---
 backend/scripts/rank.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/backend/scripts/rank.py b/backend/scripts/rank.py
index e415322..fa30398 100644
--- a/backend/scripts/rank.py
+++ b/backend/scripts/rank.py
@@ -18,15 +18,17 @@
 """Pre-Processing for Embedding"""
 
 def clean_for_embedding(to_clean):
-    text = to_clean
-    # Remove special characters that don't add meaning
-    text = re.sub(r'[^\w\s.,;:!?()-]', ' ', text)
+    text = to_clean.lower()
+    # Disabled for now: remove special characters and punctuation
+    # text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+    # # Normalize the numbers
+    # text = re.sub(r'\d+', '#', text)
     # Remove extra whitespace
     text = re.sub(r'\s+', ' ', text).strip()
-    # Remove common filler phrases that don't add semantic value
-    fillers = ["click here", "read more", "learn more", "cookies", "privacy policy"]
-    for filler in fillers:
-        text = text.replace(filler, "")
+    # Disabled for now: remove common filler phrases that don't add semantic value
+    # fillers = ["click here", "read more", "learn more", "cookies", "privacy policy"]
+    # for filler in fillers:
+    #     text = text.replace(filler, "")
 
     return text
 
@@ -34,9 +36,9 @@ def clean_for_embedding(to_clean):
 def extract_keywords(text, top_n=5):
     # Simple keyword extraction by frequency
     words = re.findall(r'\b[a-zA-Z]{3,15}\b', text.lower())
-    common_words = ["the", "and", "for", "this", "that", "with", "from"]
-    filtered = [w for w in words if w not in common_words]
-    return [word for word, _ in Counter(filtered).most_common(top_n)]
+    # common_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
+    # filtered = [w for w in words if w not in common_words]
+    return [word for word, _ in Counter(words).most_common(top_n)]
 
 # From Claude:
 def add_keywords(text):