From b46974c774546d1a2637cf34f47a07fe39735405 Mon Sep 17 00:00:00 2001
From: Koen Vossen <info@koenvossen.nl>
Date: Tue, 7 Apr 2026 10:26:59 +0200
Subject: [PATCH 1/3] Support string keys in BucketTransformation with
 bucket_count

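When the value cannot be cast to int, fall back to an MD5 hash of its
string form so string keys are distributed across buckets in a stable,
deterministic way. Values that can be cast to int (including numeric
strings) continue to use direct modulo.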
---
 ingestify/domain/services/identifier_key_transformer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py
index 0aa86b3..3e22c64 100644
--- a/ingestify/domain/services/identifier_key_transformer.py
+++ b/ingestify/domain/services/identifier_key_transformer.py
@@ -1,3 +1,4 @@
+import hashlib
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Callable, Optional, Union
@@ -51,7 +52,12 @@ def __init__(self, bucket_size: int = None, bucket_count: int = None):
 
     def __call__(self, id_key_value: Union[str, int]) -> str:
         if self.bucket_count:
-            return str(int(id_key_value) % self.bucket_count)
+            try:
+                value = int(id_key_value)
+            except (ValueError, TypeError):
+                # String keys: use stable hash to distribute across buckets
+                value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16)
+            return str(value % self.bucket_count)
         elif self.bucket_size:
             bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
             bucket_end = bucket_start + self.bucket_size - 1

From 62e39f75b4f5402cfa35e76a959ccc70f4d7e8da Mon Sep 17 00:00:00 2001
From: Koen Vossen <info@koenvossen.nl>
Date: Tue, 7 Apr 2026 10:34:23 +0200
Subject: [PATCH 2/3] URL-encode identifier values in file paths

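Percent-encode the identifier value in the `key=value` path segment
using urllib.parse.quote with safe='' (so `/` is encoded as well).
This prevents special characters, spaces, $, non-ASCII characters etc.
from causing issues in GCS/S3 paths. Simple values like integers are
left unchanged and stay readable.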
---
 ingestify/domain/services/identifier_key_transformer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py
index 3e22c64..0684d76 100644
--- a/ingestify/domain/services/identifier_key_transformer.py
+++ b/ingestify/domain/services/identifier_key_transformer.py
@@ -1,5 +1,6 @@
 import hashlib
 from abc import ABC, abstractmethod
+from urllib.parse import quote
 from enum import Enum
 from typing import Callable, Optional, Union
 
@@ -125,8 +126,9 @@ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
                 suffix = transformation.transformation_type.value.lower()
                 path_parts.append(f"{key}_{suffix}={transformed_value}")
 
-            # Append the original value (either standalone for identity or alongside transformed)
-            path_parts.append(f"{key}={value}")
+            # Append the original value (either standalone for identity or alongside transformed).
+            # URL-encode the value so special characters, spaces, etc. are safe in paths.
+            path_parts.append(f"{key}={quote(str(value), safe='')}")
 
         # Join the parts with `/` to form the full path
         return "/".join(path_parts)

From edd666cad391480744bda86e7469e6a980a3f493 Mon Sep 17 00:00:00 2001
From: Koen Vossen <info@koenvossen.nl>
Date: Tue, 7 Apr 2026 11:04:36 +0200
Subject: [PATCH 3/3] Support string keys in BucketTransformation with
 bucket_count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

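Always hashes the string representation via MD5 for consistent
bucketing — no special-casing for numeric-looking strings. Note that
this changes the bucket assigned to integer values, which previously
used direct modulo (int(value) % bucket_count).
The bucket_size path (used for integer ranges) is unchanged.

Also adds PrefixTransformation (config type "prefix"), which buckets by
the first `length` characters of the lowercased, NFKD-normalized value
with everything outside [a-z0-9] removed, falling back to "_" when
nothing remains.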
---
 .../services/identifier_key_transformer.py    | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py
index 0684d76..b4c939a 100644
--- a/ingestify/domain/services/identifier_key_transformer.py
+++ b/ingestify/domain/services/identifier_key_transformer.py
@@ -1,4 +1,6 @@
 import hashlib
+import re
+import unicodedata
 from abc import ABC, abstractmethod
 from urllib.parse import quote
 from enum import Enum
@@ -10,6 +12,7 @@
 class TransformationType(Enum):
     IDENTITY = "IDENTITY"
     BUCKET = "BUCKET"
+    PREFIX = "PREFIX"
     RANGE = "RANGE"
     CUSTOM = "CUSTOM"
 
@@ -32,6 +35,8 @@ def from_dict(cls, config: dict) -> "Transformation":
         type_ = config.pop("type")
         if type_ == "bucket":
             return BucketTransformation(**config)
+        elif type_ == "prefix":
+            return PrefixTransformation(**config)
         else:
             raise IngestifyError(f"Cannot build Transformation from {config}")
 
@@ -53,11 +58,9 @@ def __init__(self, bucket_size: int = None, bucket_count: int = None):
 
     def __call__(self, id_key_value: Union[str, int]) -> str:
         if self.bucket_count:
-            try:
-                value = int(id_key_value)
-            except (ValueError, TypeError):
-                # String keys: use stable hash to distribute across buckets
-                value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16)
+            # Always hash the string representation so bucketing is consistent
+            # regardless of whether the value looks numeric or not.
+            value = int(hashlib.md5(str(id_key_value).encode()).hexdigest(), 16)
             return str(value % self.bucket_count)
         elif self.bucket_size:
             bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
@@ -67,6 +70,19 @@ def __call__(self, id_key_value: Union[str, int]) -> str:
             raise IngestifyError("Invalid BucketTransformation")
 
 
+class PrefixTransformation(Transformation):
+    transformation_type = TransformationType.PREFIX
+
+    def __init__(self, length: int = 1):
+        self.length = length
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        # Transliterate unicode (ü→u, é→e) then strip non-alphanumeric
+        text = unicodedata.normalize("NFKD", str(id_key_value).lower())
+        cleaned = re.sub(r"[^a-z0-9]", "", text)
+        return cleaned[: self.length] if cleaned else "_"
+
+
 class IdentifierTransformer:
     def __init__(self):
         # Mapping of (provider, dataset_type, id_key) to the transformation