From b46974c774546d1a2637cf34f47a07fe39735405 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 7 Apr 2026 10:26:59 +0200 Subject: [PATCH 1/3] Support string keys in BucketTransformation with bucket_count Uses MD5 hash for stable distribution when the value cannot be cast to int. Integer values continue to use direct modulo. --- ingestify/domain/services/identifier_key_transformer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py index 0aa86b3..3e22c64 100644 --- a/ingestify/domain/services/identifier_key_transformer.py +++ b/ingestify/domain/services/identifier_key_transformer.py @@ -1,3 +1,4 @@ +import hashlib from abc import ABC, abstractmethod from enum import Enum from typing import Callable, Optional, Union @@ -51,7 +52,12 @@ def __init__(self, bucket_size: int = None, bucket_count: int = None): def __call__(self, id_key_value: Union[str, int]) -> str: if self.bucket_count: - return str(int(id_key_value) % self.bucket_count) + try: + value = int(id_key_value) + except (ValueError, TypeError): + # String keys: use stable hash to distribute across buckets + value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16) + return str(value % self.bucket_count) elif self.bucket_size: bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size bucket_end = bucket_start + self.bucket_size - 1 From 62e39f75b4f5402cfa35e76a959ccc70f4d7e8da Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 7 Apr 2026 10:34:23 +0200 Subject: [PATCH 2/3] URL-encode identifier values in file paths Prevents special characters, spaces, $, unicode etc. from causing issues in GCS/S3 paths. Simple values like integers stay readable. --- ingestify/domain/services/identifier_key_transformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py index 3e22c64..0684d76 100644 --- a/ingestify/domain/services/identifier_key_transformer.py +++ b/ingestify/domain/services/identifier_key_transformer.py @@ -1,5 +1,6 @@ import hashlib from abc import ABC, abstractmethod +from urllib.parse import quote from enum import Enum from typing import Callable, Optional, Union @@ -125,8 +126,9 @@ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str: suffix = transformation.transformation_type.value.lower() path_parts.append(f"{key}_{suffix}={transformed_value}") - # Append the original value (either standalone for identity or alongside transformed) - path_parts.append(f"{key}={value}") + # Append the original value (either standalone for identity or alongside transformed). + # URL-encode the value so special characters, spaces, etc. are safe in paths. + path_parts.append(f"{key}={quote(str(value), safe='')}") # Join the parts with `/` to form the full path return "/".join(path_parts) From edd666cad391480744bda86e7469e6a980a3f493 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 7 Apr 2026 11:04:36 +0200 Subject: [PATCH 3/3] Support string keys in BucketTransformation with bucket_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always hashes the string representation via MD5 for consistent bucketing — no special-casing for numeric-looking strings. bucket_size path (used for integer ranges) is unchanged. --- .../services/identifier_key_transformer.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py index 0684d76..b4c939a 100644 --- a/ingestify/domain/services/identifier_key_transformer.py +++ b/ingestify/domain/services/identifier_key_transformer.py @@ -1,4 +1,6 @@ import hashlib +import re +import unicodedata from abc import ABC, abstractmethod from urllib.parse import quote from enum import Enum @@ -10,6 +12,7 @@ class TransformationType(Enum): IDENTITY = "IDENTITY" BUCKET = "BUCKET" + PREFIX = "PREFIX" RANGE = "RANGE" CUSTOM = "CUSTOM" @@ -32,6 +35,8 @@ def from_dict(cls, config: dict) -> "Transformation": type_ = config.pop("type") if type_ == "bucket": return BucketTransformation(**config) + elif type_ == "prefix": + return PrefixTransformation(**config) else: raise IngestifyError(f"Cannot build Transformation from {config}") @@ -53,11 +58,9 @@ def __init__(self, bucket_size: int = None, bucket_count: int = None): def __call__(self, id_key_value: Union[str, int]) -> str: if self.bucket_count: - try: - value = int(id_key_value) - except (ValueError, TypeError): - # String keys: use stable hash to distribute across buckets - value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16) + # Always hash the string representation so bucketing is consistent + # regardless of whether the value looks numeric or not. + value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16) return str(value % self.bucket_count) elif self.bucket_size: bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size @@ -67,6 +70,19 @@ def __call__(self, id_key_value: Union[str, int]) -> str: raise IngestifyError("Invalid BucketTransformation") +class PrefixTransformation(Transformation): + transformation_type = TransformationType.PREFIX + + def __init__(self, length: int = 1): + self.length = length + + def __call__(self, id_key_value: Union[str, int]) -> str: + # Transliterate unicode (ü→u, é→e) then strip non-alphanumeric + text = unicodedata.normalize("NFKD", str(id_key_value).lower()) + cleaned = re.sub(r"[^a-z0-9]", "", text) + return cleaned[: self.length] if cleaned else "_" + + class IdentifierTransformer: def __init__(self): # Mapping of (provider, dataset_type, id_key) to the transformation