Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion packages/openmemory-js/src/memory/hsg.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import crypto from "node:crypto";
import { canonical_token_set } from "../utils/text";
import { canonical_token_set, stable_text_fallback_hash } from "../utils/text";
import { inc_q, dec_q, on_query_hit } from "./decay";
import { env, tier } from "../core/cfg";
import { cos_sim, buf_to_vec, vec_to_buf } from "../utils/index";
Expand Down Expand Up @@ -295,6 +295,9 @@ export function boosted_sim(s: number): number {
}
export function compute_simhash(text: string): string {
const tokens = canonical_token_set(text);
if (!tokens.size) {
return stable_text_fallback_hash(text);
}
const hashes = Array.from(tokens).map((t) => {
let h = 0;
for (let i = 0; i < t.length; i++) {
Expand Down
25 changes: 23 additions & 2 deletions packages/openmemory-js/src/utils/text.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import crypto from "node:crypto";

const syn_grps = [
["prefer", "like", "love", "enjoy", "favor"],
["theme", "mode", "style", "layout"],
Expand Down Expand Up @@ -32,13 +34,28 @@ const stem_rules: Array<[RegExp, string]> = [
[/ed$/, ""],
[/s$/, ""],
];
// CJK ranges: CJK ext-A, unified ideographs, compatibility ideographs,
// hiragana/katakana, and hangul syllables.
const cjk_pat = /[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af]+/u;
const tok_pat = /[a-z0-9]+|[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af]+/giu;

/** Expand a CJK run into overlapping character bigrams; single chars pass through. */
const expand_cjk_token = (tok: string): string[] => {
  if (tok.length <= 1) return [tok];
  const bigrams: string[] = [];
  for (let start = 0; start + 1 < tok.length; start++) {
    bigrams.push(tok.slice(start, start + 2));
  }
  return bigrams;
};

/**
 * Tokenize text into lowercased ASCII word/number runs plus CJK character
 * bigrams, preserving source order. The bigram expansion lets CJK text
 * (which has no whitespace word boundaries) participate in token overlap.
 */
export const tokenize = (text: string): string[] => {
  const out: string[] = [];
  for (const match of text.matchAll(tok_pat)) {
    const run = match[0];
    if (cjk_pat.test(run)) {
      out.push(...expand_cjk_token(run));
    } else {
      out.push(run.toLowerCase());
    }
  }
  return out;
};
Expand Down Expand Up @@ -102,6 +119,10 @@ export const canonical_token_set = (text: string): Set<string> => {
return new Set(canonical_tokens_from_text(text));
};

/**
 * Deterministic 16-hex-char fallback hash for text that yields no tokens.
 * NOTE(review): the Python counterpart uses blake2b with digest_size=8,
 * whose output is NOT the same as a truncated blake2b-512 digest — confirm
 * cross-runtime hash parity is not required before comparing hashes across
 * the JS and Python implementations.
 */
export const stable_text_fallback_hash = (text: string): string => {
  const full_digest = crypto.createHash("blake2b512").update(text, "utf8").digest("hex");
  return full_digest.slice(0, 16);
};

export const add_synonym_tokens = (toks: Iterable<string>): Set<string> => {
const res = new Set<string>();
for (const tok of toks) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import assert from "node:assert/strict";
import { canonical_tokens_from_text, tokenize } from "../src/utils/text";
import { compute_simhash } from "../src/memory/hsg";

// Two Chinese sentences sharing a prefix ("我喜欢…") but with distinct content.
const gymText = "我喜欢健身";
const teaText = "我喜欢普洱茶";

// CJK runs must expand into overlapping character bigrams.
assert.deepEqual(tokenize(teaText), ["我喜", "喜欢", "欢普", "普洱", "洱茶"]);

const gymTokens = canonical_tokens_from_text(gymText);
const teaTokens = canonical_tokens_from_text(teaText);

// Both sentences tokenize to something, and the token sets differ.
assert.ok(gymTokens.length > 0);
assert.ok(teaTokens.length > 0);
assert.notDeepEqual(new Set(gymTokens), new Set(teaTokens));

// Distinct Chinese inputs must not collapse into one simhash bucket.
assert.notEqual(compute_simhash(gymText), compute_simhash(teaText));

// Token-free inputs fall back to a stable, input-dependent hash.
assert.notEqual(compute_simhash("!!!"), compute_simhash("???"));
assert.equal(compute_simhash("!!!"), compute_simhash("!!!"));

console.log("test_multilingual_dedup.ts passed");
4 changes: 3 additions & 1 deletion packages/openmemory-py/src/openmemory/memory/hsg.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ..core.config import env
from ..core.constants import SECTOR_CONFIGS
from ..core.vector_store import vector_store as store
from ..utils.text import canonical_token_set, canonical_tokens_from_text
from ..utils.text import canonical_token_set, canonical_tokens_from_text, stable_text_fallback_hash
from ..utils.chunking import chunk_text
from ..utils.keyword import keyword_filter_memories, compute_keyword_overlap
from ..utils.vectors import buf_to_vec, vec_to_buf, cos_sim
Expand Down Expand Up @@ -164,6 +164,8 @@ def boosted_sim(s: float) -> float:

def compute_simhash(text: str) -> str:
tokens = canonical_token_set(text)
if not tokens:
return stable_text_fallback_hash(text)
hashes = []
for t in tokens:
h = 0
Expand Down
23 changes: 21 additions & 2 deletions packages/openmemory-py/src/openmemory/utils/text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import hashlib
from typing import List, Set, Dict

SYN_GRPS = [
Expand Down Expand Up @@ -35,10 +36,24 @@
(r"s$", ""),
]

TOK_PAT = re.compile(r"[a-z0-9]+")
CJK_PAT = r"\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af"
TOK_PAT = re.compile(rf"[a-z0-9]+|[{CJK_PAT}]+", re.I)


def _expand_cjk_token(tok: str) -> List[str]:
if len(tok) <= 1:
return [tok]
return [tok[i : i + 2] for i in range(len(tok) - 1)]

def tokenize(text: str) -> List[str]:
return [m.lower() for m in TOK_PAT.findall(text)]
res: List[str] = []
for tok in TOK_PAT.findall(text):
low = tok.lower()
if re.fullmatch(rf"[{CJK_PAT}]+", tok):
res.extend(_expand_cjk_token(tok))
else:
res.append(low)
return res

def stem(tok: str) -> str:
if len(tok) <= 3: return tok
Expand Down Expand Up @@ -85,3 +100,7 @@ def build_fts_query(text: str) -> str:

def canonical_token_set(text: str) -> Set[str]:
    """Order-insensitive set of canonical tokens, for overlap/hash comparisons."""
    tokens = canonical_tokens_from_text(text)
    return set(tokens)


def stable_text_fallback_hash(text: str) -> str:
    """Deterministic 16-hex-char hash used when tokenization yields no tokens."""
    digest = hashlib.blake2b(text.encode("utf-8"), digest_size=8)
    return digest.hexdigest()
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from __future__ import annotations

import importlib.util
import sys
import types
from pathlib import Path


# Source tree of the openmemory package; assumes tests/ sits beside src/.
ROOT = Path(__file__).resolve().parents[1] / "src" / "openmemory"


def _ensure_pkg(name: str) -> types.ModuleType:
mod = sys.modules.get(name)
if mod is None:
mod = types.ModuleType(name)
mod.__path__ = [] # type: ignore[attr-defined]
sys.modules[name] = mod
return mod


def _stub_module(name: str, **attrs: object) -> None:
mod = types.ModuleType(name)
for key, value in attrs.items():
setattr(mod, key, value)
sys.modules[name] = mod


def _load_module(name: str, path: Path):
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
assert spec and spec.loader
sys.modules[name] = module
spec.loader.exec_module(module)
return module


def _load_text_and_hsg():
    # Load text.py and hsg.py straight from source without installing the
    # package: create the package skeleton, stub every runtime dependency of
    # hsg.py, then import both modules from their files.
    _ensure_pkg("openmemory")
    _ensure_pkg("openmemory.utils")
    _ensure_pkg("openmemory.memory")
    _ensure_pkg("openmemory.core")
    _ensure_pkg("openmemory.ops")

    # text.py needs no stubs, so it can be loaded first.
    text = _load_module("openmemory.utils.text", ROOT / "utils" / "text.py")

    # Stub the modules hsg.py imports at module scope. The stand-ins are inert
    # no-ops: these tests only exercise pure hashing/tokenizing helpers, so the
    # stubbed callables are never expected to do real work.
    _stub_module("openmemory.core.db", q=None, db=None, transaction=lambda: None)
    _stub_module("openmemory.core.config", env=types.SimpleNamespace())
    _stub_module("openmemory.core.constants", SECTOR_CONFIGS={})
    _stub_module("openmemory.core.vector_store", vector_store=None)
    _stub_module("openmemory.utils.chunking", chunk_text=lambda *args, **kwargs: [])
    _stub_module(
        "openmemory.utils.keyword",
        keyword_filter_memories=lambda *args, **kwargs: [],
        compute_keyword_overlap=lambda *args, **kwargs: 0.0,
    )
    _stub_module(
        "openmemory.utils.vectors",
        buf_to_vec=lambda *args, **kwargs: [],
        vec_to_buf=lambda *args, **kwargs: b"",
        cos_sim=lambda *args, **kwargs: 0.0,
    )
    _stub_module(
        "openmemory.memory.embed",
        embed_multi_sector=lambda *args, **kwargs: {},
        embed_for_sector=lambda *args, **kwargs: [],
        calc_mean_vec=lambda *args, **kwargs: [],
    )
    _stub_module(
        "openmemory.memory.decay",
        inc_q=lambda *args, **kwargs: None,
        dec_q=lambda *args, **kwargs: None,
        on_query_hit=lambda *args, **kwargs: None,
        calc_recency_score=lambda *args, **kwargs: 0.0,
        pick_tier=lambda *args, **kwargs: "cold",
    )
    _stub_module(
        "openmemory.ops.dynamics",
        calculateCrossSectorResonanceScore=lambda *args, **kwargs: 0.0,
        applyRetrievalTraceReinforcementToMemory=lambda *args, **kwargs: None,
        propagateAssociativeReinforcementToLinkedNodes=lambda *args, **kwargs: None,
    )
    _stub_module("openmemory.memory.user_summary", update_user_summary=lambda *args, **kwargs: None)

    # hsg.py is imported last, after all of its dependencies resolve to stubs.
    hsg = _load_module("openmemory.memory.hsg", ROOT / "memory" / "hsg.py")
    return text, hsg


# Loaded once at import time; every test below reads from these two modules.
TEXT, HSG = _load_text_and_hsg()


def test_tokenize_expands_cjk_bigrams():
    """Chinese runs are split into overlapping character bigrams."""
    expected = ["我喜", "喜欢", "欢普", "普洱", "洱茶"]
    assert TEXT.tokenize("我喜欢普洱茶") == expected


def test_canonical_tokens_keep_distinct_chinese_content():
    """Sentences sharing a prefix still yield non-empty, distinguishable tokens."""
    gym_tokens = TEXT.canonical_tokens_from_text("我喜欢健身")
    tea_tokens = TEXT.canonical_tokens_from_text("我喜欢普洱茶")

    assert gym_tokens
    assert tea_tokens
    assert set(gym_tokens) != set(tea_tokens)


def test_compute_simhash_avoids_constant_hash_for_distinct_chinese_inputs():
    """Distinct Chinese inputs must not collapse into a single simhash bucket."""
    gym_hash = HSG.compute_simhash("我喜欢健身")
    tea_hash = HSG.compute_simhash("我喜欢普洱茶")
    assert gym_hash != tea_hash


def test_compute_simhash_uses_stable_fallback_when_tokenizer_finds_nothing():
    """Token-free text gets a deterministic, input-dependent fallback hash."""
    bang_hash = HSG.compute_simhash("!!!")
    question_hash = HSG.compute_simhash("???")

    assert bang_hash != question_hash
    assert bang_hash == HSG.compute_simhash("!!!")