From 69447c6503caadabb03a8f440881ea7938da1b20 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 26 Mar 2026 14:55:21 +0800
Subject: [PATCH 1/8] feat: add environment variables for dataset caching and
 subprocess control

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/calib_dataset.py | 138 +++++++++++++++++++++++++++++++++++-
 auto_round/envs.py          |   4 ++
 docs/environments.md        |  19 +++++
 3 files changed, 159 insertions(+), 2 deletions(-)

diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index b4bbf65b5..b6b4ec80f 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import hashlib
 import json
 import logging
+import multiprocessing
+import os
 import random
+import shutil
 import sys
 
 logging.getLogger("datasets").setLevel(logging.WARNING)
@@ -23,6 +27,7 @@
 from datasets import Dataset, Features, IterableDataset, Sequence, Value, concatenate_datasets, load_dataset
 from torch.utils.data import DataLoader
 
+from . import envs
 from .utils import is_local_path, logger
 
 CALIB_DATASETS = {}
@@ -641,8 +646,36 @@ def select_dataset(dataset, indices):
         return dataset
 
 
-def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512):
-    """Generate a dataset for calibration.
+_DATASET_CACHE_VERSION = "v1"
+
+
+def _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples):
+    """Compute a deterministic cache key for dataset preprocessing."""
+    parts = [
+        _DATASET_CACHE_VERSION,
+        getattr(tokenizer, "name_or_path", type(tokenizer).__name__),
+        str(getattr(tokenizer, "vocab_size", 0)),
+        str(seqlen),
+        dataset_name,
+        str(seed),
+        str(nsamples),
+    ]
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def _get_dataset_cache_path(cache_key):
+    """Get the cache directory path for a given cache key."""
+    cache_root = envs.AR_DATASET_CACHE_DIR
+    return os.path.join(cache_root, cache_key)
+
+
+def _is_cache_valid(cache_path):
+    """Check if a dataset cache directory is valid and complete."""
+    return os.path.isdir(cache_path) and os.path.exists(os.path.join(cache_path, "_done"))
+
+
+def _get_dataset_impl(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512):
+    """Internal implementation: generate a dataset for calibration.
 
     Args:
         tokenizer (Tokenizer): The tokenizer to use for tokenization.
@@ -764,6 +797,23 @@ def concat_dataset_element(dataset):
         )
         if do_concat:
             dataset = concat_dataset_element(dataset)
+
+        # After .map() tokenization on the first run, the tokenizer produces
+        # large temporary Python objects (batch string copies, token-id lists)
+        # during processing.  Although the final Arrow result is already mmap'd
+        # from the cache file, those temporaries linger in the Python/glibc
+        # heap.  Force GC + malloc_trim to return those pages to the OS before
+        # the next memory-intensive steps (filter, cast).
+        import gc
+
+        gc.collect()
+        try:
+            import ctypes
+
+            ctypes.CDLL("libc.so.6").malloc_trim(0)
+        except Exception:
+            pass
+
         dataset = dataset.filter(filter_func)
         if name in data_lens:
             dataset = select_dataset(dataset, range(data_lens[name]))
@@ -780,6 +830,7 @@ def concat_dataset_element(dataset):
                 new_features[k] = v
 
         dataset = dataset.cast(Features(new_features))
+
         datasets.append(dataset)
 
     if len(datasets) == 1:
@@ -829,6 +880,89 @@ def concat_dataset_element(dataset):
     return dataset_final
 
 
+def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path):
+    """Worker function executed in a child process.
+
+    Runs the full dataset preprocessing pipeline and saves the result to disk.
+    When this process exits, all preprocessing memory is reclaimed by the OS.
+    """
+    dataset = _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
+    # Clean up any partial cache from a previous failed run
+    if os.path.exists(cache_path):
+        shutil.rmtree(cache_path, ignore_errors=True)
+    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+    dataset.save_to_disk(cache_path)
+    # Write a marker file to indicate the cache is complete
+    with open(os.path.join(cache_path, "_done"), "w") as f:
+        f.write("done")
+
+
+def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512):
+    """Generate a dataset for calibration.
+
+    Uses a subprocess for preprocessing to ensure all temporary memory is fully
+    reclaimed by the OS when the subprocess exits. Results are cached to disk
+    so subsequent runs with the same parameters load instantly.
+
+    Set environment variable ``AR_DISABLE_DATASET_SUBPROCESS=1`` to disable
+    subprocess mode and run preprocessing in the main process.
+    Set environment variable ``AR_DATASET_CACHE_DIR`` to customize the cache
+    directory (default: ``~/.cache/auto_round/datasets/``).
+
+    Args:
+        tokenizer: The tokenizer to use for tokenization.
+        seqlen (int): The exact sequence length.
+        dataset_name (str, optional): Dataset name(s) separated by commas.
+        seed (int, optional): Random seed for reproducibility. Defaults to 42.
+        nsamples (int, optional): Total number of samples to include. Defaults to 512.
+
+    Returns:
+        Dataset: The processed dataset ready for calibration.
+    """
+    # Allow disabling subprocess mode via environment variable
+    if envs.AR_DISABLE_DATASET_SUBPROCESS:
+        return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
+
+    cache_key = _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples)
+    cache_path = _get_dataset_cache_path(cache_key)
+
+    # Check if valid cache exists
+    if _is_cache_valid(cache_path):
+        logger.info(f"Loading cached calibration dataset from {cache_path}")
+        return Dataset.load_from_disk(cache_path)
+
+    # Run preprocessing in a subprocess so all temporary memory is freed on exit
+    logger.info("Preprocessing calibration dataset in a subprocess to avoid memory leaks...")
+
+    try:
+        if os.name == "nt":
+            raise OSError("fork is not available on Windows")
+
+        ctx = multiprocessing.get_context("fork")
+        p = ctx.Process(
+            target=_subprocess_worker,
+            args=(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path),
+        )
+        p.start()
+        p.join()
+
+        if p.exitcode != 0:
+            raise RuntimeError(f"Dataset preprocessing subprocess exited with code {p.exitcode}")
+
+        if not _is_cache_valid(cache_path):
+            raise RuntimeError("Dataset cache was not created successfully")
+
+        logger.info(f"Loading preprocessed calibration dataset from {cache_path}")
+        return Dataset.load_from_disk(cache_path)
+
+    except Exception as e:
+        logger.warning(f"Subprocess dataset preprocessing failed ({e}), falling back to in-process mode.")
+        # Clean up any partial cache
+        if os.path.exists(cache_path):
+            shutil.rmtree(cache_path, ignore_errors=True)
+        return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
+
+
 def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512):
     """Generate a DataLoader for calibration using specified parameters.
 
diff --git a/auto_round/envs.py b/auto_round/envs.py
index b4576dece..2ca1c4b6e 100644
--- a/auto_round/envs.py
+++ b/auto_round/envs.py
@@ -31,6 +31,10 @@
     in ["1", "true"],
     "AR_OMP_NUM_THREADS": lambda: os.getenv("AR_OMP_NUM_THREADS", None),
     "AR_DISABLE_OFFLOAD": lambda: os.getenv("AR_DISABLE_OFFLOAD", "0").lower() in ("1", "true", "yes"),
+    "AR_DISABLE_DATASET_SUBPROCESS": lambda: os.getenv("AR_DISABLE_DATASET_SUBPROCESS", "0").lower() in ("1", "true"),
+    "AR_DATASET_CACHE_DIR": lambda: os.getenv(
+        "AR_DATASET_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "auto_round", "datasets")
+    ),
 }
 
 
diff --git a/docs/environments.md b/docs/environments.md
index 78e71eab7..0d57ec438 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -47,6 +47,25 @@ export AR_USE_MODELSCOPE=true
 export AR_WORK_SPACE=/path/to/custom/workspace
 ```
 
+### AR_DISABLE_DATASET_SUBPROCESS
+- **Description**: Disables the use of a subprocess for dataset preprocessing. By default, AutoRound uses a subprocess to ensure all temporary memory is reclaimed by the OS.
+- **Default**: `False`
+- **Valid Values**: `"1"`, `"true"` (case-insensitive) to disable the subprocess; any other value keeps the subprocess enabled
+- **Usage**: Set this to run dataset preprocessing in the main process
+
+```bash
+export AR_DISABLE_DATASET_SUBPROCESS=true
+```
+
+### AR_DATASET_CACHE_DIR
+- **Description**: Sets the cache directory for preprocessed datasets.
+- **Default**: `"~/.cache/auto_round/datasets/"`
+- **Usage**: Specify a custom directory to store cached datasets
+
+```bash
+export AR_DATASET_CACHE_DIR=/path/to/custom/cache
+```
+
 ## Usage Examples
 
 ### Setting Environment Variables

From b33934bdb7e516656f6e02a5cd6f05c69e2764d7 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 26 Mar 2026 15:09:11 +0800
Subject: [PATCH 2/8] remove useless code

---
 auto_round/calib_dataset.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index b6b4ec80f..55f37c744 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -798,22 +798,6 @@ def concat_dataset_element(dataset):
         if do_concat:
             dataset = concat_dataset_element(dataset)
 
-        # After .map() tokenization on the first run, the tokenizer produces
-        # large temporary Python objects (batch string copies, token-id lists)
-        # during processing.  Although the final Arrow result is already mmap'd
-        # from the cache file, those temporaries linger in the Python/glibc
-        # heap.  Force GC + malloc_trim to return those pages to the OS before
-        # the next memory-intensive steps (filter, cast).
-        import gc
-
-        gc.collect()
-        try:
-            import ctypes
-
-            ctypes.CDLL("libc.so.6").malloc_trim(0)
-        except Exception:
-            pass
-
         dataset = dataset.filter(filter_func)
         if name in data_lens:
             dataset = select_dataset(dataset, range(data_lens[name]))
@@ -830,7 +814,6 @@ def concat_dataset_element(dataset):
                 new_features[k] = v
 
         dataset = dataset.cast(Features(new_features))
-
         datasets.append(dataset)
 
     if len(datasets) == 1:

From b28569776a196256106c45f7684c3c550d182fa7 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 26 Mar 2026 15:21:22 +0800
Subject: [PATCH 3/8] remove cache logic to reuse datasets package

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/calib_dataset.py | 85 ++++++++-----------------------------
 auto_round/envs.py          |  3 --
 docs/environments.md        |  9 ----
 3 files changed, 18 insertions(+), 79 deletions(-)

diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index 55f37c744..bf7f1965a 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import hashlib
 import json
 import logging
 import multiprocessing
 import os
 import random
-import shutil
 import sys
 
 logging.getLogger("datasets").setLevel(logging.WARNING)
@@ -646,34 +644,6 @@ def select_dataset(dataset, indices):
         return dataset
 
 
-_DATASET_CACHE_VERSION = "v1"
-
-
-def _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples):
-    """Compute a deterministic cache key for dataset preprocessing."""
-    parts = [
-        _DATASET_CACHE_VERSION,
-        getattr(tokenizer, "name_or_path", type(tokenizer).__name__),
-        str(getattr(tokenizer, "vocab_size", 0)),
-        str(seqlen),
-        dataset_name,
-        str(seed),
-        str(nsamples),
-    ]
-    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
-
-
-def _get_dataset_cache_path(cache_key):
-    """Get the cache directory path for a given cache key."""
-    cache_root = envs.AR_DATASET_CACHE_DIR
-    return os.path.join(cache_root, cache_key)
-
-
-def _is_cache_valid(cache_path):
-    """Check if a dataset cache directory is valid and complete."""
-    return os.path.isdir(cache_path) and os.path.exists(os.path.join(cache_path, "_done"))
-
-
 def _get_dataset_impl(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512):
     """Internal implementation: generate a dataset for calibration.
 
@@ -863,34 +833,28 @@ def concat_dataset_element(dataset):
     return dataset_final
 
 
-def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path):
+def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples):
     """Worker function executed in a child process.
 
-    Runs the full dataset preprocessing pipeline and saves the result to disk.
-    When this process exits, all preprocessing memory is reclaimed by the OS.
+    Runs the full dataset preprocessing pipeline to warm up the HuggingFace
+    datasets cache. When this process exits, all temporary preprocessing
+    memory is reclaimed by the OS, while the cached results remain on disk
+    for the main process to reuse.
     """
-    dataset = _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
-    # Clean up any partial cache from a previous failed run
-    if os.path.exists(cache_path):
-        shutil.rmtree(cache_path, ignore_errors=True)
-    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
-    dataset.save_to_disk(cache_path)
-    # Write a marker file to indicate the cache is complete
-    with open(os.path.join(cache_path, "_done"), "w") as f:
-        f.write("done")
+    _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
 
 
 def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512):
     """Generate a dataset for calibration.
 
     Uses a subprocess for preprocessing to ensure all temporary memory is fully
-    reclaimed by the OS when the subprocess exits. Results are cached to disk
-    so subsequent runs with the same parameters load instantly.
+    reclaimed by the OS when the subprocess exits.  The HuggingFace ``datasets``
+    library automatically caches intermediate results (e.g. ``.map()``,
+    ``.filter()``), so the main process can reload them cheaply after the
+    subprocess finishes.
 
     Set environment variable ``AR_DISABLE_DATASET_SUBPROCESS=1`` to disable
     subprocess mode and run preprocessing in the main process.
-    Set environment variable ``AR_DATASET_CACHE_DIR`` to customize the cache
-    directory (default: ``~/.cache/auto_round/datasets/``).
 
     Args:
         tokenizer: The tokenizer to use for tokenization.
@@ -906,15 +870,8 @@ def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, n
     if envs.AR_DISABLE_DATASET_SUBPROCESS:
         return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
 
-    cache_key = _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples)
-    cache_path = _get_dataset_cache_path(cache_key)
-
-    # Check if valid cache exists
-    if _is_cache_valid(cache_path):
-        logger.info(f"Loading cached calibration dataset from {cache_path}")
-        return Dataset.load_from_disk(cache_path)
-
-    # Run preprocessing in a subprocess so all temporary memory is freed on exit
+    # Run preprocessing in a subprocess so all temporary memory is freed on exit.
+    # The HuggingFace datasets cache is warmed up as a side effect.
     logger.info("Preprocessing calibration dataset in a subprocess to avoid memory leaks...")
 
     try:
@@ -923,8 +880,8 @@ def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, n
 
         ctx = multiprocessing.get_context("fork")
         p = ctx.Process(
-            target=_subprocess_worker,
-            args=(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path),
+            target=_get_dataset_impl,
+            args=(tokenizer, seqlen, dataset_name, seed, nsamples),
         )
         p.start()
         p.join()
@@ -932,18 +889,12 @@ def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, n
         if p.exitcode != 0:
             raise RuntimeError(f"Dataset preprocessing subprocess exited with code {p.exitcode}")
 
-        if not _is_cache_valid(cache_path):
-            raise RuntimeError("Dataset cache was not created successfully")
-
-        logger.info(f"Loading preprocessed calibration dataset from {cache_path}")
-        return Dataset.load_from_disk(cache_path)
-
     except Exception as e:
         logger.warning(f"Subprocess dataset preprocessing failed ({e}), falling back to in-process mode.")
-        # Clean up any partial cache
-        if os.path.exists(cache_path):
-            shutil.rmtree(cache_path, ignore_errors=True)
-        return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
+
+    # (Re-)load the dataset in the main process.  When the subprocess
+    # succeeded the HF datasets cache makes this almost instant.
+    return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
 
 
 def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512):
diff --git a/auto_round/envs.py b/auto_round/envs.py
index 2ca1c4b6e..abf51e16f 100644
--- a/auto_round/envs.py
+++ b/auto_round/envs.py
@@ -32,9 +32,6 @@
     "AR_OMP_NUM_THREADS": lambda: os.getenv("AR_OMP_NUM_THREADS", None),
     "AR_DISABLE_OFFLOAD": lambda: os.getenv("AR_DISABLE_OFFLOAD", "0").lower() in ("1", "true", "yes"),
     "AR_DISABLE_DATASET_SUBPROCESS": lambda: os.getenv("AR_DISABLE_DATASET_SUBPROCESS", "0").lower() in ("1", "true"),
-    "AR_DATASET_CACHE_DIR": lambda: os.getenv(
-        "AR_DATASET_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "auto_round", "datasets")
-    ),
 }
 
 
diff --git a/docs/environments.md b/docs/environments.md
index 0d57ec438..14c7e0182 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -57,15 +57,6 @@ export AR_WORK_SPACE=/path/to/custom/workspace
 export AR_DISABLE_DATASET_SUBPROCESS=true
 ```
 
-### AR_DATASET_CACHE_DIR
-- **Description**: Sets the cache directory for preprocessed datasets.
-- **Default**: `"~/.cache/auto_round/datasets/"`
-- **Usage**: Specify a custom directory to store cached datasets
-
-```bash
-export AR_DATASET_CACHE_DIR=/path/to/custom/cache
-```
-
 ## Usage Examples
 
 ### Setting Environment Variables

From ed3e940b5647ede94aabef09a0f38adc18def278 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 26 Mar 2026 13:39:30 +0800
Subject: [PATCH 4/8] fix hash failure

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/calib_dataset.py | 56 ++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index bf7f1965a..aaa2db3a5 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -88,6 +88,30 @@ def apply_chat_template_to_samples(samples, tokenizer, seqlen, system_prompt=Non
     return example
 
 
+def _make_map_fingerprint(dataset, tokenizer, seqlen, apply_chat_template, system_prompt, text_key="text"):
+    """Compute a stable fingerprint for Dataset.map() calls.
+
+    datasets uses dill to serialize the transform function for cache fingerprinting.
+    HuggingFace tokenizer objects are not reliably serializable by dill, causing
+    a random hash to be used each run — which breaks caching entirely.
+
+    This function computes a deterministic fingerprint from stable string
+    identifiers (tokenizer name, seqlen, etc.) so that caching works correctly
+    and subsequent runs can load from disk instead of re-tokenizing in RAM.
+    """
+    import hashlib
+
+    parts = [
+        getattr(dataset, "_fingerprint", "no_fingerprint"),
+        getattr(tokenizer, "name_or_path", type(tokenizer).__name__),
+        str(seqlen),
+        str(apply_chat_template),
+        str(system_prompt),
+        text_key,
+    ]
+    return hashlib.md5("|".join(parts).encode()).hexdigest()
+
+
 def get_tokenizer_function(tokenizer, seqlen, apply_chat_template=False, system_prompt=None):
     """Returns a default tokenizer function.
 
@@ -157,7 +181,13 @@ def get_pile_dataset(
             logger.error(f"Failed to load the dataset: {error_message}")
         sys.exit(1)
     calib_dataset = calib_dataset.shuffle(seed=seed)
-    calib_dataset = calib_dataset.map(tokenizer_function, batched=True)
+    calib_dataset = calib_dataset.map(
+        tokenizer_function,
+        batched=True,
+        new_fingerprint=_make_map_fingerprint(
+            calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "text"
+        ),
+    )
 
     return calib_dataset
 
@@ -453,7 +483,13 @@ def default_tokenizer_function(examples):
 
     calib_dataset = load_dataset("madao33/new-title-chinese", split=split)
     calib_dataset = calib_dataset.shuffle(seed=seed)
-    calib_dataset = calib_dataset.map(tokenizer_function, batched=True)
+    calib_dataset = calib_dataset.map(
+        tokenizer_function,
+        batched=True,
+        new_fingerprint=_make_map_fingerprint(
+            calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "content"
+        ),
+    )
 
     return calib_dataset
 
@@ -505,7 +541,13 @@ def get_mbpp_dataset(
     import datasets
 
     calib_dataset = datasets.Dataset.from_list(samples)
-    calib_dataset = calib_dataset.map(tokenizer_function, batched=True)
+    calib_dataset = calib_dataset.map(
+        tokenizer_function,
+        batched=True,
+        new_fingerprint=_make_map_fingerprint(
+            calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "text"
+        ),
+    )
 
     return calib_dataset
 
@@ -574,7 +616,13 @@ def load_local_data(data_path):
     import datasets
 
     calib_dataset = datasets.Dataset.from_list(samples)
-    calib_dataset = calib_dataset.map(tokenizer_function, batched=True)
+    calib_dataset = calib_dataset.map(
+        tokenizer_function,
+        batched=True,
+        new_fingerprint=_make_map_fingerprint(
+            calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "text"
+        ),
+    )
     return calib_dataset
 
 

From d48c8ecd0f283764d61815ed0bc4508f4e710f36 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 26 Mar 2026 15:23:28 +0800
Subject: [PATCH 5/8] remove useless code

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/calib_dataset.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index aaa2db3a5..5301a72f7 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -881,17 +881,6 @@ def concat_dataset_element(dataset):
     return dataset_final
 
 
-def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples):
-    """Worker function executed in a child process.
-
-    Runs the full dataset preprocessing pipeline to warm up the HuggingFace
-    datasets cache. When this process exits, all temporary preprocessing
-    memory is reclaimed by the OS, while the cached results remain on disk
-    for the main process to reuse.
-    """
-    _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples)
-
-
 def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512):
     """Generate a dataset for calibration.
 

From b4283884144792830592893164517616d9a5f725 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Fri, 27 Mar 2026 11:19:07 +0800
Subject: [PATCH 6/8] Update auto_round/calib_dataset.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 auto_round/calib_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index 5301a72f7..5b48e814c 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -109,7 +109,7 @@ def _make_map_fingerprint(dataset, tokenizer, seqlen, apply_chat_template, syste
         str(system_prompt),
         text_key,
     ]
-    return hashlib.md5("|".join(parts).encode()).hexdigest()
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()
 
 
 def get_tokenizer_function(tokenizer, seqlen, apply_chat_template=False, system_prompt=None):

From e2683a5495126843c2feab55e02bfbf3e6465e2c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 27 Mar 2026 03:20:17 +0000
Subject: [PATCH 7/8] docs: add Chinese translation for environments.md and add
 language links

Agent-Logs-Url: https://github.com/intel/auto-round/sessions/0c14c972-2687-4283-aee6-3017898d7e0e

Co-authored-by: xin3he <83260933+xin3he@users.noreply.github.com>
---
 docs/environments.md    |   2 +
 docs/environments_CN.md | 147 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 docs/environments_CN.md

diff --git a/docs/environments.md b/docs/environments.md
index 14c7e0182..2622df05d 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -1,5 +1,7 @@
 # AutoRound Environment Variables Configuration
 
+English | [简体中文](./environments_CN.md)
+
 This document describes the environment variables used by AutoRound for configuration and their usage.
 
 ## Overview
diff --git a/docs/environments_CN.md b/docs/environments_CN.md
new file mode 100644
index 000000000..f605ab9a7
--- /dev/null
+++ b/docs/environments_CN.md
@@ -0,0 +1,147 @@
+# AutoRound 环境变量配置
+
+[English](./environments.md) | 简体中文
+
+本文档介绍 AutoRound 使用的环境变量及其配置说明。
+
+## 概述
+
+AutoRound 通过 `envs.py` 模块提供统一的环境变量管理系统，支持懒加载求值与程序化配置。
+
+## 可用环境变量
+
+### AR_LOG_LEVEL
+- **描述**：控制 AutoRound 默认日志级别
+- **默认值**：`"INFO"`
+- **有效值**：`"TRACE"`、`"DEBUG"`、`"INFO"`、`"WARNING"`、`"ERROR"`、`"CRITICAL"`
+- **用途**：通过设置该变量控制 AutoRound 的日志详细程度
+
+```bash
+export AR_LOG_LEVEL=DEBUG
+```
+
+### AR_ENABLE_COMPILE_PACKING
+- **描述**：启用编译打包优化
+- **默认值**：`False`（等价于 `"0"`）
+- **有效值**：`"1"`、`"true"`、`"yes"`（不区分大小写）表示启用；其他值表示禁用
+- **用途**：启用后可在将 FP4 张量打包为 `uint8` 时获得性能优化
+
+```bash
+export AR_ENABLE_COMPILE_PACKING=1
+```
+
+### AR_USE_MODELSCOPE
+- **描述**：控制是否使用 ModelScope 下载模型
+- **默认值**：`False`
+- **有效值**：`"1"`、`"true"`（不区分大小写）表示启用；其他值表示禁用
+- **用途**：启用后将使用 ModelScope 替代 Hugging Face Hub 下载模型
+
+```bash
+export AR_USE_MODELSCOPE=true
+```
+
+### AR_WORK_SPACE
+- **描述**：设置 AutoRound 操作的工作目录
+- **默认值**：`"ar_work_space"`
+- **用途**：指定 AutoRound 存储临时文件和输出结果的自定义目录
+
+```bash
+export AR_WORK_SPACE=/path/to/custom/workspace
+```
+
+### AR_DISABLE_DATASET_SUBPROCESS
+- **描述**：禁用子进程方式进行数据集预处理。默认情况下，AutoRound 使用子进程确保所有临时内存在进程退出后被操作系统回收。
+- **默认值**：`False`
+- **有效值**：`"1"`、`"true"`（不区分大小写）表示禁用子进程；其他值表示启用子进程
+- **用途**：设置后数据集预处理将在主进程中运行
+
+```bash
+export AR_DISABLE_DATASET_SUBPROCESS=true
+```
+
+## 使用示例
+
+### 设置环境变量
+
+#### 通过 Shell 命令
+```bash
+# 将日志级别设置为 DEBUG
+export AR_LOG_LEVEL=DEBUG
+
+# 启用编译打包
+export AR_ENABLE_COMPILE_PACKING=1
+
+# 使用 ModelScope 下载模型
+export AR_USE_MODELSCOPE=true
+
+# 设置自定义工作目录
+export AR_WORK_SPACE=/tmp/autoround_workspace
+```
+
+#### 通过 Python 代码
+```python
+from auto_round.envs import set_config
+
+# 同时配置多个环境变量
+set_config(
+    AR_LOG_LEVEL="DEBUG",
+    AR_USE_MODELSCOPE=True,
+    AR_ENABLE_COMPILE_PACKING=True,
+    AR_WORK_SPACE="/tmp/autoround_workspace",
+)
+```
+
+### 查看环境变量
+
+#### 通过 Python 代码
+```python
+from auto_round import envs
+
+# 访问环境变量（懒加载求值）
+log_level = envs.AR_LOG_LEVEL
+use_modelscope = envs.AR_USE_MODELSCOPE
+enable_packing = envs.AR_ENABLE_COMPILE_PACKING
+workspace = envs.AR_WORK_SPACE
+
+print(f"日志级别: {log_level}")
+print(f"使用 ModelScope: {use_modelscope}")
+print(f"启用编译打包: {enable_packing}")
+print(f"工作目录: {workspace}")
+```
+
+#### 检查变量是否显式设置
+```python
+from auto_round.envs import is_set
+
+# 检查环境变量是否被显式设置
+if is_set("AR_LOG_LEVEL"):
+    print("AR_LOG_LEVEL 已被显式设置")
+else:
+    print("AR_LOG_LEVEL 正在使用默认值")
+```
+
+### AR_DISABLE_OFFLOAD
+- **描述**：强制禁用 `OffloadManager` 中的权重卸载功能。在开发和调试时可跳过所有卸载/重载开销。
+- **默认值**：`False`（等价于 `"0"`）
+- **有效值**：`"1"`、`"true"`、`"yes"`（不区分大小写）表示禁用卸载；其他值保持默认行为
+- **用途**：设置后将完全绕过权重卸载
+
+```bash
+export AR_DISABLE_OFFLOAD=1
+```
+
+## 配置最佳实践
+
+1. **开发环境**：设置 `AR_LOG_LEVEL=TRACE` 或 `AR_LOG_LEVEL=DEBUG` 以获取详细日志
+2. **生产环境**：使用 `AR_LOG_LEVEL=WARNING` 或 `AR_LOG_LEVEL=ERROR` 减少日志噪声
+3. **中国用户**：建议设置 `AR_USE_MODELSCOPE=true` 以获得更好的模型下载速度
+4. **性能优化**：如有足够算力，可启用 `AR_ENABLE_COMPILE_PACKING=1`
+5. **自定义工作目录**：将 `AR_WORK_SPACE` 设置为磁盘空间充足的目录
+
+## 注意事项
+
+- 环境变量采用懒加载方式，仅在首次访问时读取
+- `set_config()` 函数提供了便捷的程序化多变量配置方式
+- `AR_USE_MODELSCOPE` 的布尔值会自动转换为适当的字符串表示
+- 所有环境变量名称区分大小写
+- 通过 `set_config()` 所做的修改将影响当前进程及其子进程

From 2dae9c99a98d188dbe800e3c7729d24cb6b3eeae Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Fri, 27 Mar 2026 11:27:38 +0800
Subject: [PATCH 8/8] update doc

Signed-off-by: Xin He <xin3.he@intel.com>
---
 docs/environments.md    | 20 ++++++++++----------
 docs/environments_CN.md | 20 ++++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/environments.md b/docs/environments.md
index 2622df05d..242d660d7 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -49,6 +49,16 @@ export AR_USE_MODELSCOPE=true
 export AR_WORK_SPACE=/path/to/custom/workspace
 ```
 
+### AR_DISABLE_OFFLOAD
+- **Description**: Forcibly disables the weight offloading feature in `OffloadManager`. Useful during development and debugging to skip all offload/reload overhead.
+- **Default**: `False` (equivalent to `"0"`)
+- **Valid Values**: `"1"`, `"true"`, `"yes"` (case-insensitive) for disabling offload; any other value keeps the default behavior
+- **Usage**: Set this to bypass offloading entirely
+
+```bash
+export AR_DISABLE_OFFLOAD=1
+```
+
 ### AR_DISABLE_DATASET_SUBPROCESS
 - **Description**: Disables the use of a subprocess for dataset preprocessing. By default, AutoRound uses a subprocess to ensure all temporary memory is reclaimed by the OS.
 - **Default**: `False`
@@ -120,16 +130,6 @@ else:
     print("AR_LOG_LEVEL is using default value")
 ```
 
-### AR_DISABLE_OFFLOAD
-- **Description**: Forcibly disables the weight offloading feature in `OffloadManager`. Useful during development and debugging to skip all offload/reload overhead.
-- **Default**: `False` (equivalent to `"0"`)
-- **Valid Values**: `"1"`, `"true"`, `"yes"` (case-insensitive) for disabling offload; any other value keeps the default behavior
-- **Usage**: Set this to bypass offloading entirely
-
-```bash
-export AR_DISABLE_OFFLOAD=1
-```
-
 ## Configuration Best Practices
 
 1. **Development Environment**: Set `AR_LOG_LEVEL=TRACE` or `AR_LOG_LEVEL=DEBUG` for detailed logging during development
diff --git a/docs/environments_CN.md b/docs/environments_CN.md
index f605ab9a7..7407354cf 100644
--- a/docs/environments_CN.md
+++ b/docs/environments_CN.md
@@ -49,6 +49,16 @@ export AR_USE_MODELSCOPE=true
 export AR_WORK_SPACE=/path/to/custom/workspace
 ```
 
+### AR_DISABLE_OFFLOAD
+- **描述**：强制禁用 `OffloadManager` 中的权重卸载功能。在开发和调试时可跳过所有卸载/重载开销。
+- **默认值**：`False`（等价于 `"0"`）
+- **有效值**：`"1"`、`"true"`、`"yes"`（不区分大小写）表示禁用卸载；其他值保持默认行为
+- **用途**：设置后将完全绕过权重卸载
+
+```bash
+export AR_DISABLE_OFFLOAD=1
+```
+
 ### AR_DISABLE_DATASET_SUBPROCESS
 - **描述**：禁用子进程方式进行数据集预处理。默认情况下，AutoRound 使用子进程确保所有临时内存在进程退出后被操作系统回收。
 - **默认值**：`False`
@@ -120,16 +130,6 @@ else:
     print("AR_LOG_LEVEL 正在使用默认值")
 ```
 
-### AR_DISABLE_OFFLOAD
-- **描述**：强制禁用 `OffloadManager` 中的权重卸载功能。在开发和调试时可跳过所有卸载/重载开销。
-- **默认值**：`False`（等价于 `"0"`）
-- **有效值**：`"1"`、`"true"`、`"yes"`（不区分大小写）表示禁用卸载；其他值保持默认行为
-- **用途**：设置后将完全绕过权重卸载
-
-```bash
-export AR_DISABLE_OFFLOAD=1
-```
-
 ## 配置最佳实践
 
 1. **开发环境**：设置 `AR_LOG_LEVEL=TRACE` 或 `AR_LOG_LEVEL=DEBUG` 以获取详细日志