PySport · koenvo · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py
@@ -1,4 +1,8 @@
+import hashlib
+import re
+import unicodedata
 from abc import ABC, abstractmethod
+from urllib.parse import quote
 from enum import Enum
 from typing import Callable, Optional, Union
 
@@ -8,6 +12,7 @@
 class TransformationType(Enum):
     IDENTITY = "IDENTITY"
     BUCKET = "BUCKET"
+    PREFIX = "PREFIX"
     RANGE = "RANGE"
     CUSTOM = "CUSTOM"
 
@@ -30,6 +35,8 @@ def from_dict(cls, config: dict) -> "Transformation":
         type_ = config.pop("type")
         if type_ == "bucket":
             return BucketTransformation(**config)
+        elif type_ == "prefix":
+            return PrefixTransformation(**config)
         else:
             raise IngestifyError(f"Cannot build Transformation from {config}")
 
@@ -51,7 +58,10 @@ def __init__(self, bucket_size: int = None, bucket_count: int = None):
 
     def __call__(self, id_key_value: Union[str, int]) -> str:
         if self.bucket_count:
-            return str(int(id_key_value) % self.bucket_count)
+            # Always hash the string representation so bucketing is consistent
+            # regardless of whether the value looks numeric or not.
+            value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16)
+            return str(value % self.bucket_count)
         elif self.bucket_size:
             bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
             bucket_end = bucket_start + self.bucket_size - 1
@@ -60,6 +70,19 @@ def __call__(self, id_key_value: Union[str, int]) -> str:
             raise IngestifyError("Invalid BucketTransformation")
 
 
+class PrefixTransformation(Transformation):
+    """Derive a short alphanumeric prefix from an identifier value.
+
+    The value is stringified, NFKD-normalized, lowercased and reduced to the
+    characters ``a-z0-9``; the first ``length`` remaining characters are
+    returned. Accented letters keep their base letter (``ü`` -> ``u``)
+    because the regex removes the combining marks that NFKD splits off.
+    Characters without an ASCII decomposition (e.g. ``ø``, ``ß``) are dropped.
+
+    Args:
+        length: Number of leading characters to keep; must be a positive
+            integer.
+
+    Raises:
+        IngestifyError: If ``length`` is not a positive integer.
+    """
+
+    transformation_type = TransformationType.PREFIX
+
+    def __init__(self, length: int = 1):
+        # Reject 0 / negative / non-int values up front: ``cleaned[:0]`` would
+        # produce an empty prefix (bypassing the "_" fallback) and a negative
+        # slice bound would silently drop trailing characters instead.
+        if not isinstance(length, int) or length < 1:
+            raise IngestifyError(
+                "Invalid PrefixTransformation: length must be a positive "
+                f"integer, got {length!r}"
+            )
+        self.length = length
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        """Return the prefix of ``id_key_value``, or ``"_"`` if no ``a-z0-9`` characters remain."""
+        # Normalize *before* lowercasing. Some characters only turn into ASCII
+        # letters through compatibility decomposition and have no lowercase
+        # mapping of their own ("™" -> "TM", "ℂ" -> "C"); lowercasing first
+        # would leave those letters uppercase and the filter below would
+        # discard them.
+        text = unicodedata.normalize("NFKD", str(id_key_value)).lower()
+        cleaned = re.sub(r"[^a-z0-9]", "", text)
+        return cleaned[: self.length] if cleaned else "_"
+
+
 class IdentifierTransformer:
     def __init__(self):
         # Mapping of (provider, dataset_type, id_key) to the transformation
@@ -119,8 +142,9 @@ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
                 suffix = transformation.transformation_type.value.lower()
                 path_parts.append(f"{key}_{suffix}={transformed_value}")
 
-            # Append the original value (either standalone for identity or alongside transformed)
-            path_parts.append(f"{key}={value}")
+            # Append the original value (either standalone for identity or alongside transformed).
+            # URL-encode the value so special characters, spaces, etc. are safe in paths.
+            path_parts.append(f"{key}={quote(str(value), safe='')}")
 
         # Join the parts with `/` to form the full path
         return "/".join(path_parts)