From 69447c6503caadabb03a8f440881ea7938da1b20 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 26 Mar 2026 14:55:21 +0800 Subject: [PATCH 1/8] feat: add environment variables for dataset caching and subprocess control Signed-off-by: Xin He --- auto_round/calib_dataset.py | 138 +++++++++++++++++++++++++++++++++++- auto_round/envs.py | 4 ++ docs/environments.md | 19 +++++ 3 files changed, 159 insertions(+), 2 deletions(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index b4bbf65b5..b6b4ec80f 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import hashlib import json import logging +import multiprocessing +import os import random +import shutil import sys logging.getLogger("datasets").setLevel(logging.WARNING) @@ -23,6 +27,7 @@ from datasets import Dataset, Features, IterableDataset, Sequence, Value, concatenate_datasets, load_dataset from torch.utils.data import DataLoader +from . import envs from .utils import is_local_path, logger CALIB_DATASETS = {} @@ -641,8 +646,36 @@ def select_dataset(dataset, indices): return dataset -def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512): - """Generate a dataset for calibration. 
+_DATASET_CACHE_VERSION = "v1" + + +def _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples): + """Compute a deterministic cache key for dataset preprocessing.""" + parts = [ + _DATASET_CACHE_VERSION, + getattr(tokenizer, "name_or_path", type(tokenizer).__name__), + str(getattr(tokenizer, "vocab_size", 0)), + str(seqlen), + dataset_name, + str(seed), + str(nsamples), + ] + return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] + + +def _get_dataset_cache_path(cache_key): + """Get the cache directory path for a given cache key.""" + cache_root = envs.AR_DATASET_CACHE_DIR + return os.path.join(cache_root, cache_key) + + +def _is_cache_valid(cache_path): + """Check if a dataset cache directory is valid and complete.""" + return os.path.isdir(cache_path) and os.path.exists(os.path.join(cache_path, "_done")) + + +def _get_dataset_impl(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512): + """Internal implementation: generate a dataset for calibration. Args: tokenizer (Tokenizer): The tokenizer to use for tokenization. @@ -764,6 +797,23 @@ def concat_dataset_element(dataset): ) if do_concat: dataset = concat_dataset_element(dataset) + + # After .map() tokenization on the first run, the tokenizer produces + # large temporary Python objects (batch string copies, token-id lists) + # during processing. Although the final Arrow result is already mmap'd + # from the cache file, those temporaries linger in the Python/glibc + # heap. Force GC + malloc_trim to return those pages to the OS before + # the next memory-intensive steps (filter, cast). 
+ import gc + + gc.collect() + try: + import ctypes + + ctypes.CDLL("libc.so.6").malloc_trim(0) + except Exception: + pass + dataset = dataset.filter(filter_func) if name in data_lens: dataset = select_dataset(dataset, range(data_lens[name])) @@ -780,6 +830,7 @@ def concat_dataset_element(dataset): new_features[k] = v dataset = dataset.cast(Features(new_features)) + datasets.append(dataset) if len(datasets) == 1: @@ -829,6 +880,89 @@ def concat_dataset_element(dataset): return dataset_final +def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path): + """Worker function executed in a child process. + + Runs the full dataset preprocessing pipeline and saves the result to disk. + When this process exits, all preprocessing memory is reclaimed by the OS. + """ + dataset = _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) + # Clean up any partial cache from a previous failed run + if os.path.exists(cache_path): + shutil.rmtree(cache_path, ignore_errors=True) + os.makedirs(os.path.dirname(cache_path), exist_ok=True) + dataset.save_to_disk(cache_path) + # Write a marker file to indicate the cache is complete + with open(os.path.join(cache_path, "_done"), "w") as f: + f.write("done") + + +def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512): + """Generate a dataset for calibration. + + Uses a subprocess for preprocessing to ensure all temporary memory is fully + reclaimed by the OS when the subprocess exits. Results are cached to disk + so subsequent runs with the same parameters load instantly. + + Set environment variable ``AR_DISABLE_DATASET_SUBPROCESS=1`` to disable + subprocess mode and run preprocessing in the main process. + Set environment variable ``AR_DATASET_CACHE_DIR`` to customize the cache + directory (default: ``~/.cache/auto_round/datasets/``). + + Args: + tokenizer: The tokenizer to use for tokenization. + seqlen (int): The exact sequence length. 
+ dataset_name (str, optional): Dataset name(s) separated by commas. + seed (int, optional): Random seed for reproducibility. Defaults to 42. + nsamples (int, optional): Total number of samples to include. Defaults to 512. + + Returns: + Dataset: The processed dataset ready for calibration. + """ + # Allow disabling subprocess mode via environment variable + if envs.AR_DISABLE_DATASET_SUBPROCESS: + return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) + + cache_key = _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples) + cache_path = _get_dataset_cache_path(cache_key) + + # Check if valid cache exists + if _is_cache_valid(cache_path): + logger.info(f"Loading cached calibration dataset from {cache_path}") + return Dataset.load_from_disk(cache_path) + + # Run preprocessing in a subprocess so all temporary memory is freed on exit + logger.info("Preprocessing calibration dataset in a subprocess to avoid memory leaks...") + + try: + if os.name == "nt": + raise OSError("fork is not available on Windows") + + ctx = multiprocessing.get_context("fork") + p = ctx.Process( + target=_subprocess_worker, + args=(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path), + ) + p.start() + p.join() + + if p.exitcode != 0: + raise RuntimeError(f"Dataset preprocessing subprocess exited with code {p.exitcode}") + + if not _is_cache_valid(cache_path): + raise RuntimeError("Dataset cache was not created successfully") + + logger.info(f"Loading preprocessed calibration dataset from {cache_path}") + return Dataset.load_from_disk(cache_path) + + except Exception as e: + logger.warning(f"Subprocess dataset preprocessing failed ({e}), falling back to in-process mode.") + # Clean up any partial cache + if os.path.exists(cache_path): + shutil.rmtree(cache_path, ignore_errors=True) + return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) + + def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, 
nsamples=512): """Generate a DataLoader for calibration using specified parameters. diff --git a/auto_round/envs.py b/auto_round/envs.py index b4576dece..2ca1c4b6e 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -31,6 +31,10 @@ in ["1", "true"], "AR_OMP_NUM_THREADS": lambda: os.getenv("AR_OMP_NUM_THREADS", None), "AR_DISABLE_OFFLOAD": lambda: os.getenv("AR_DISABLE_OFFLOAD", "0").lower() in ("1", "true", "yes"), + "AR_DISABLE_DATASET_SUBPROCESS": lambda: os.getenv("AR_DISABLE_DATASET_SUBPROCESS", "0").lower() in ("1", "true"), + "AR_DATASET_CACHE_DIR": lambda: os.getenv( + "AR_DATASET_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "auto_round", "datasets") + ), } diff --git a/docs/environments.md b/docs/environments.md index 78e71eab7..0d57ec438 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -47,6 +47,25 @@ export AR_USE_MODELSCOPE=true export AR_WORK_SPACE=/path/to/custom/workspace ``` +### AR_DISABLE_DATASET_SUBPROCESS +- **Description**: Disables the use of a subprocess for dataset preprocessing. By default, AutoRound uses a subprocess to ensure all temporary memory is reclaimed by the OS. +- **Default**: `False` +- **Valid Values**: `"1"`, `"true"` (case-insensitive) for disabling; any other value for enabling +- **Usage**: Set this to run dataset preprocessing in the main process + +```bash +export AR_DISABLE_DATASET_SUBPROCESS=true +``` + +### AR_DATASET_CACHE_DIR +- **Description**: Sets the cache directory for preprocessed datasets. 
+- **Default**: `"~/.cache/auto_round/datasets/"` +- **Usage**: Specify a custom directory to store cached datasets + +```bash +export AR_DATASET_CACHE_DIR=/path/to/custom/cache +``` + ## Usage Examples ### Setting Environment Variables From b33934bdb7e516656f6e02a5cd6f05c69e2764d7 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 26 Mar 2026 15:09:11 +0800 Subject: [PATCH 2/8] remove useless code --- auto_round/calib_dataset.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index b6b4ec80f..55f37c744 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -798,22 +798,6 @@ def concat_dataset_element(dataset): if do_concat: dataset = concat_dataset_element(dataset) - # After .map() tokenization on the first run, the tokenizer produces - # large temporary Python objects (batch string copies, token-id lists) - # during processing. Although the final Arrow result is already mmap'd - # from the cache file, those temporaries linger in the Python/glibc - # heap. Force GC + malloc_trim to return those pages to the OS before - # the next memory-intensive steps (filter, cast). 
- import gc - - gc.collect() - try: - import ctypes - - ctypes.CDLL("libc.so.6").malloc_trim(0) - except Exception: - pass - dataset = dataset.filter(filter_func) if name in data_lens: dataset = select_dataset(dataset, range(data_lens[name])) @@ -830,7 +814,6 @@ def concat_dataset_element(dataset): new_features[k] = v dataset = dataset.cast(Features(new_features)) - datasets.append(dataset) if len(datasets) == 1: From b28569776a196256106c45f7684c3c550d182fa7 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 26 Mar 2026 15:21:22 +0800 Subject: [PATCH 3/8] remove cache logic to reuse datasets package Signed-off-by: Xin He --- auto_round/calib_dataset.py | 85 ++++++++----------------------------- auto_round/envs.py | 3 -- docs/environments.md | 9 ---- 3 files changed, 18 insertions(+), 79 deletions(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index 55f37c744..bf7f1965a 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -12,13 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import hashlib import json import logging import multiprocessing import os import random -import shutil import sys logging.getLogger("datasets").setLevel(logging.WARNING) @@ -646,34 +644,6 @@ def select_dataset(dataset, indices): return dataset -_DATASET_CACHE_VERSION = "v1" - - -def _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples): - """Compute a deterministic cache key for dataset preprocessing.""" - parts = [ - _DATASET_CACHE_VERSION, - getattr(tokenizer, "name_or_path", type(tokenizer).__name__), - str(getattr(tokenizer, "vocab_size", 0)), - str(seqlen), - dataset_name, - str(seed), - str(nsamples), - ] - return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] - - -def _get_dataset_cache_path(cache_key): - """Get the cache directory path for a given cache key.""" - cache_root = envs.AR_DATASET_CACHE_DIR - return os.path.join(cache_root, cache_key) - - -def _is_cache_valid(cache_path): - """Check if a dataset cache directory is valid and complete.""" - return os.path.isdir(cache_path) and os.path.exists(os.path.join(cache_path, "_done")) - - def _get_dataset_impl(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512): """Internal implementation: generate a dataset for calibration. @@ -863,34 +833,28 @@ def concat_dataset_element(dataset): return dataset_final -def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path): +def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples): """Worker function executed in a child process. - Runs the full dataset preprocessing pipeline and saves the result to disk. - When this process exits, all preprocessing memory is reclaimed by the OS. + Runs the full dataset preprocessing pipeline to warm up the HuggingFace + datasets cache. When this process exits, all temporary preprocessing + memory is reclaimed by the OS, while the cached results remain on disk + for the main process to reuse. 
""" - dataset = _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) - # Clean up any partial cache from a previous failed run - if os.path.exists(cache_path): - shutil.rmtree(cache_path, ignore_errors=True) - os.makedirs(os.path.dirname(cache_path), exist_ok=True) - dataset.save_to_disk(cache_path) - # Write a marker file to indicate the cache is complete - with open(os.path.join(cache_path, "_done"), "w") as f: - f.write("done") + _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512): """Generate a dataset for calibration. Uses a subprocess for preprocessing to ensure all temporary memory is fully - reclaimed by the OS when the subprocess exits. Results are cached to disk - so subsequent runs with the same parameters load instantly. + reclaimed by the OS when the subprocess exits. The HuggingFace ``datasets`` + library automatically caches intermediate results (e.g. ``.map()``, + ``.filter()``), so the main process can reload them cheaply after the + subprocess finishes. Set environment variable ``AR_DISABLE_DATASET_SUBPROCESS=1`` to disable subprocess mode and run preprocessing in the main process. - Set environment variable ``AR_DATASET_CACHE_DIR`` to customize the cache - directory (default: ``~/.cache/auto_round/datasets/``). Args: tokenizer: The tokenizer to use for tokenization. 
@@ -906,15 +870,8 @@ def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, n if envs.AR_DISABLE_DATASET_SUBPROCESS: return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) - cache_key = _compute_dataset_cache_key(tokenizer, seqlen, dataset_name, seed, nsamples) - cache_path = _get_dataset_cache_path(cache_key) - - # Check if valid cache exists - if _is_cache_valid(cache_path): - logger.info(f"Loading cached calibration dataset from {cache_path}") - return Dataset.load_from_disk(cache_path) - - # Run preprocessing in a subprocess so all temporary memory is freed on exit + # Run preprocessing in a subprocess so all temporary memory is freed on exit. + # The HuggingFace datasets cache is warmed up as a side effect. logger.info("Preprocessing calibration dataset in a subprocess to avoid memory leaks...") try: @@ -923,8 +880,8 @@ def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, n ctx = multiprocessing.get_context("fork") p = ctx.Process( - target=_subprocess_worker, - args=(tokenizer, seqlen, dataset_name, seed, nsamples, cache_path), + target=_get_dataset_impl, + args=(tokenizer, seqlen, dataset_name, seed, nsamples), ) p.start() p.join() @@ -932,18 +889,12 @@ def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, n if p.exitcode != 0: raise RuntimeError(f"Dataset preprocessing subprocess exited with code {p.exitcode}") - if not _is_cache_valid(cache_path): - raise RuntimeError("Dataset cache was not created successfully") - - logger.info(f"Loading preprocessed calibration dataset from {cache_path}") - return Dataset.load_from_disk(cache_path) - except Exception as e: logger.warning(f"Subprocess dataset preprocessing failed ({e}), falling back to in-process mode.") - # Clean up any partial cache - if os.path.exists(cache_path): - shutil.rmtree(cache_path, ignore_errors=True) - return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) + + # (Re-)load the 
dataset in the main process. When the subprocess + # succeeded the HF datasets cache makes this almost instant. + return _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512): diff --git a/auto_round/envs.py b/auto_round/envs.py index 2ca1c4b6e..abf51e16f 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -32,9 +32,6 @@ "AR_OMP_NUM_THREADS": lambda: os.getenv("AR_OMP_NUM_THREADS", None), "AR_DISABLE_OFFLOAD": lambda: os.getenv("AR_DISABLE_OFFLOAD", "0").lower() in ("1", "true", "yes"), "AR_DISABLE_DATASET_SUBPROCESS": lambda: os.getenv("AR_DISABLE_DATASET_SUBPROCESS", "0").lower() in ("1", "true"), - "AR_DATASET_CACHE_DIR": lambda: os.getenv( - "AR_DATASET_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "auto_round", "datasets") - ), } diff --git a/docs/environments.md b/docs/environments.md index 0d57ec438..14c7e0182 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -57,15 +57,6 @@ export AR_WORK_SPACE=/path/to/custom/workspace export AR_DISABLE_DATASET_SUBPROCESS=true ``` -### AR_DATASET_CACHE_DIR -- **Description**: Sets the cache directory for preprocessed datasets. 
-- **Default**: `"~/.cache/auto_round/datasets/"` -- **Usage**: Specify a custom directory to store cached datasets - -```bash -export AR_DATASET_CACHE_DIR=/path/to/custom/cache -``` - ## Usage Examples ### Setting Environment Variables From ed3e940b5647ede94aabef09a0f38adc18def278 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 26 Mar 2026 13:39:30 +0800 Subject: [PATCH 4/8] fix hash failure Signed-off-by: Xin He --- auto_round/calib_dataset.py | 56 ++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index bf7f1965a..aaa2db3a5 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -88,6 +88,30 @@ def apply_chat_template_to_samples(samples, tokenizer, seqlen, system_prompt=Non return example +def _make_map_fingerprint(dataset, tokenizer, seqlen, apply_chat_template, system_prompt, text_key="text"): + """Compute a stable fingerprint for Dataset.map() calls. + + datasets uses dill to serialize the transform function for cache fingerprinting. + HuggingFace tokenizer objects are not reliably serializable by dill, causing + a random hash to be used each run — which breaks caching entirely. + + This function computes a deterministic fingerprint from stable string + identifiers (tokenizer name, seqlen, etc.) so that caching works correctly + and subsequent runs can load from disk instead of re-tokenizing in RAM. + """ + import hashlib + + parts = [ + getattr(dataset, "_fingerprint", "no_fingerprint"), + getattr(tokenizer, "name_or_path", type(tokenizer).__name__), + str(seqlen), + str(apply_chat_template), + str(system_prompt), + text_key, + ] + return hashlib.md5("|".join(parts).encode()).hexdigest() + + def get_tokenizer_function(tokenizer, seqlen, apply_chat_template=False, system_prompt=None): """Returns a default tokenizer function. 
@@ -157,7 +181,13 @@ def get_pile_dataset( logger.error(f"Failed to load the dataset: {error_message}") sys.exit(1) calib_dataset = calib_dataset.shuffle(seed=seed) - calib_dataset = calib_dataset.map(tokenizer_function, batched=True) + calib_dataset = calib_dataset.map( + tokenizer_function, + batched=True, + new_fingerprint=_make_map_fingerprint( + calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "text" + ), + ) return calib_dataset @@ -453,7 +483,13 @@ def default_tokenizer_function(examples): calib_dataset = load_dataset("madao33/new-title-chinese", split=split) calib_dataset = calib_dataset.shuffle(seed=seed) - calib_dataset = calib_dataset.map(tokenizer_function, batched=True) + calib_dataset = calib_dataset.map( + tokenizer_function, + batched=True, + new_fingerprint=_make_map_fingerprint( + calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "content" + ), + ) return calib_dataset @@ -505,7 +541,13 @@ def get_mbpp_dataset( import datasets calib_dataset = datasets.Dataset.from_list(samples) - calib_dataset = calib_dataset.map(tokenizer_function, batched=True) + calib_dataset = calib_dataset.map( + tokenizer_function, + batched=True, + new_fingerprint=_make_map_fingerprint( + calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "text" + ), + ) return calib_dataset @@ -574,7 +616,13 @@ def load_local_data(data_path): import datasets calib_dataset = datasets.Dataset.from_list(samples) - calib_dataset = calib_dataset.map(tokenizer_function, batched=True) + calib_dataset = calib_dataset.map( + tokenizer_function, + batched=True, + new_fingerprint=_make_map_fingerprint( + calib_dataset, tokenizer, seqlen, apply_chat_template, system_prompt, "text" + ), + ) return calib_dataset From d48c8ecd0f283764d61815ed0bc4508f4e710f36 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 26 Mar 2026 15:23:28 +0800 Subject: [PATCH 5/8] remove useless code Signed-off-by: Xin He --- auto_round/calib_dataset.py | 11 ----------- 1 
file changed, 11 deletions(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index aaa2db3a5..5301a72f7 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -881,17 +881,6 @@ def concat_dataset_element(dataset): return dataset_final -def _subprocess_worker(tokenizer, seqlen, dataset_name, seed, nsamples): - """Worker function executed in a child process. - - Runs the full dataset preprocessing pipeline to warm up the HuggingFace - datasets cache. When this process exits, all temporary preprocessing - memory is reclaimed by the OS, while the cached results remain on disk - for the main process to reuse. - """ - _get_dataset_impl(tokenizer, seqlen, dataset_name, seed, nsamples) - - def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, nsamples=512): """Generate a dataset for calibration. From b4283884144792830592893164517616d9a5f725 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 27 Mar 2026 11:19:07 +0800 Subject: [PATCH 6/8] Update auto_round/calib_dataset.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/calib_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index 5301a72f7..5b48e814c 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -109,7 +109,7 @@ def _make_map_fingerprint(dataset, tokenizer, seqlen, apply_chat_template, syste str(system_prompt), text_key, ] - return hashlib.md5("|".join(parts).encode()).hexdigest() + return hashlib.sha256("|".join(parts).encode()).hexdigest() def get_tokenizer_function(tokenizer, seqlen, apply_chat_template=False, system_prompt=None): From e2683a5495126843c2feab55e02bfbf3e6465e2c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 03:20:17 +0000 Subject: [PATCH 7/8] docs: add Chinese translation for environments.md and add language 
links Agent-Logs-Url: https://github.com/intel/auto-round/sessions/0c14c972-2687-4283-aee6-3017898d7e0e Co-authored-by: xin3he <83260933+xin3he@users.noreply.github.com> --- docs/environments.md | 2 + docs/environments_CN.md | 147 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 docs/environments_CN.md diff --git a/docs/environments.md b/docs/environments.md index 14c7e0182..2622df05d 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -1,5 +1,7 @@ # AutoRound Environment Variables Configuration +English | [简体中文](./environments_CN.md) + This document describes the environment variables used by AutoRound for configuration and their usage. ## Overview diff --git a/docs/environments_CN.md b/docs/environments_CN.md new file mode 100644 index 000000000..f605ab9a7 --- /dev/null +++ b/docs/environments_CN.md @@ -0,0 +1,147 @@ +# AutoRound 环境变量配置 + +[English](./environments.md) | 简体中文 + +本文档介绍 AutoRound 使用的环境变量及其配置说明。 + +## 概述 + +AutoRound 通过 `envs.py` 模块提供统一的环境变量管理系统,支持懒加载求值与程序化配置。 + +## 可用环境变量 + +### AR_LOG_LEVEL +- **描述**:控制 AutoRound 默认日志级别 +- **默认值**:`"INFO"` +- **有效值**:`"TRACE"`、`"DEBUG"`、`"INFO"`、`"WARNING"`、`"ERROR"`、`"CRITICAL"` +- **用途**:通过设置该变量控制 AutoRound 的日志详细程度 + +```bash +export AR_LOG_LEVEL=DEBUG +``` + +### AR_ENABLE_COMPILE_PACKING +- **描述**:启用编译打包优化 +- **默认值**:`False`(等价于 `"0"`) +- **有效值**:`"1"`、`"true"`、`"yes"`(不区分大小写)表示启用;其他值表示禁用 +- **用途**:启用后可在将 FP4 张量打包为 `uint8` 时获得性能优化 + +```bash +export AR_ENABLE_COMPILE_PACKING=1 +``` + +### AR_USE_MODELSCOPE +- **描述**:控制是否使用 ModelScope 下载模型 +- **默认值**:`False` +- **有效值**:`"1"`、`"true"`(不区分大小写)表示启用;其他值表示禁用 +- **用途**:启用后将使用 ModelScope 替代 Hugging Face Hub 下载模型 + +```bash +export AR_USE_MODELSCOPE=true +``` + +### AR_WORK_SPACE +- **描述**:设置 AutoRound 操作的工作目录 +- **默认值**:`"ar_work_space"` +- **用途**:指定 AutoRound 存储临时文件和输出结果的自定义目录 + +```bash +export AR_WORK_SPACE=/path/to/custom/workspace +``` + +### AR_DISABLE_DATASET_SUBPROCESS +- 
**描述**:禁用子进程方式进行数据集预处理。默认情况下,AutoRound 使用子进程确保所有临时内存在进程退出后被操作系统回收。 +- **默认值**:`False` +- **有效值**:`"1"`、`"true"`(不区分大小写)表示禁用子进程;其他值表示启用子进程 +- **用途**:设置后数据集预处理将在主进程中运行 + +```bash +export AR_DISABLE_DATASET_SUBPROCESS=true +``` + +## 使用示例 + +### 设置环境变量 + +#### 通过 Shell 命令 +```bash +# 将日志级别设置为 DEBUG +export AR_LOG_LEVEL=DEBUG + +# 启用编译打包 +export AR_ENABLE_COMPILE_PACKING=1 + +# 使用 ModelScope 下载模型 +export AR_USE_MODELSCOPE=true + +# 设置自定义工作目录 +export AR_WORK_SPACE=/tmp/autoround_workspace +``` + +#### 通过 Python 代码 +```python +from auto_round.envs import set_config + +# 同时配置多个环境变量 +set_config( + AR_LOG_LEVEL="DEBUG", + AR_USE_MODELSCOPE=True, + AR_ENABLE_COMPILE_PACKING=True, + AR_WORK_SPACE="/tmp/autoround_workspace", +) +``` + +### 查看环境变量 + +#### 通过 Python 代码 +```python +from auto_round import envs + +# 访问环境变量(懒加载求值) +log_level = envs.AR_LOG_LEVEL +use_modelscope = envs.AR_USE_MODELSCOPE +enable_packing = envs.AR_ENABLE_COMPILE_PACKING +workspace = envs.AR_WORK_SPACE + +print(f"日志级别: {log_level}") +print(f"使用 ModelScope: {use_modelscope}") +print(f"启用编译打包: {enable_packing}") +print(f"工作目录: {workspace}") +``` + +#### 检查变量是否显式设置 +```python +from auto_round.envs import is_set + +# 检查环境变量是否被显式设置 +if is_set("AR_LOG_LEVEL"): + print("AR_LOG_LEVEL 已被显式设置") +else: + print("AR_LOG_LEVEL 正在使用默认值") +``` + +### AR_DISABLE_OFFLOAD +- **描述**:强制禁用 `OffloadManager` 中的权重卸载功能。在开发和调试时可跳过所有卸载/重载开销。 +- **默认值**:`False`(等价于 `"0"`) +- **有效值**:`"1"`、`"true"`、`"yes"`(不区分大小写)表示禁用卸载;其他值保持默认行为 +- **用途**:设置后将完全绕过权重卸载 + +```bash +export AR_DISABLE_OFFLOAD=1 +``` + +## 配置最佳实践 + +1. **开发环境**:设置 `AR_LOG_LEVEL=TRACE` 或 `AR_LOG_LEVEL=DEBUG` 以获取详细日志 +2. **生产环境**:使用 `AR_LOG_LEVEL=WARNING` 或 `AR_LOG_LEVEL=ERROR` 减少日志噪声 +3. **中国用户**:建议设置 `AR_USE_MODELSCOPE=true` 以获得更好的模型下载速度 +4. **性能优化**:如有足够算力,可启用 `AR_ENABLE_COMPILE_PACKING=1` +5. 
**自定义工作目录**:将 `AR_WORK_SPACE` 设置为磁盘空间充足的目录 + +## 注意事项 + +- 环境变量采用懒加载方式,仅在首次访问时读取 +- `set_config()` 函数提供了便捷的程序化多变量配置方式 +- `AR_USE_MODELSCOPE` 的布尔值会自动转换为适当的字符串表示 +- 所有环境变量名称区分大小写 +- 通过 `set_config()` 所做的修改将影响当前进程及其子进程 From 2dae9c99a98d188dbe800e3c7729d24cb6b3eeae Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 27 Mar 2026 11:27:38 +0800 Subject: [PATCH 8/8] update doc Signed-off-by: Xin He --- docs/environments.md | 20 ++++++++++---------- docs/environments_CN.md | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/environments.md b/docs/environments.md index 2622df05d..242d660d7 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -49,6 +49,16 @@ export AR_USE_MODELSCOPE=true export AR_WORK_SPACE=/path/to/custom/workspace ``` +### AR_DISABLE_OFFLOAD +- **Description**: Forcibly disables the weight offloading feature in `OffloadManager`. Useful during development and debugging to skip all offload/reload overhead. +- **Default**: `False` (equivalent to `"0"`) +- **Valid Values**: `"1"`, `"true"`, `"yes"` (case-insensitive) for disabling offload; any other value keeps the default behavior +- **Usage**: Set this to bypass offloading entirely + +```bash +export AR_DISABLE_OFFLOAD=1 +``` + ### AR_DISABLE_DATASET_SUBPROCESS - **Description**: Disables the use of a subprocess for dataset preprocessing. By default, AutoRound uses a subprocess to ensure all temporary memory is reclaimed by the OS. - **Default**: `False` @@ -120,16 +130,6 @@ else: print("AR_LOG_LEVEL is using default value") ``` -### AR_DISABLE_OFFLOAD -- **Description**: Forcibly disables the weight offloading feature in `OffloadManager`. Useful during development and debugging to skip all offload/reload overhead. 
-- **Default**: `False` (equivalent to `"0"`) -- **Valid Values**: `"1"`, `"true"`, `"yes"` (case-insensitive) for disabling offload; any other value keeps the default behavior -- **Usage**: Set this to bypass offloading entirely - -```bash -export AR_DISABLE_OFFLOAD=1 -``` - ## Configuration Best Practices 1. **Development Environment**: Set `AR_LOG_LEVEL=TRACE` or `AR_LOG_LEVEL=DEBUG` for detailed logging during development diff --git a/docs/environments_CN.md b/docs/environments_CN.md index f605ab9a7..7407354cf 100644 --- a/docs/environments_CN.md +++ b/docs/environments_CN.md @@ -49,6 +49,16 @@ export AR_USE_MODELSCOPE=true export AR_WORK_SPACE=/path/to/custom/workspace ``` +### AR_DISABLE_OFFLOAD +- **描述**:强制禁用 `OffloadManager` 中的权重卸载功能。在开发和调试时可跳过所有卸载/重载开销。 +- **默认值**:`False`(等价于 `"0"`) +- **有效值**:`"1"`、`"true"`、`"yes"`(不区分大小写)表示禁用卸载;其他值保持默认行为 +- **用途**:设置后将完全绕过权重卸载 + +```bash +export AR_DISABLE_OFFLOAD=1 +``` + ### AR_DISABLE_DATASET_SUBPROCESS - **描述**:禁用子进程方式进行数据集预处理。默认情况下,AutoRound 使用子进程确保所有临时内存在进程退出后被操作系统回收。 - **默认值**:`False` @@ -120,16 +130,6 @@ else: print("AR_LOG_LEVEL 正在使用默认值") ``` -### AR_DISABLE_OFFLOAD -- **描述**:强制禁用 `OffloadManager` 中的权重卸载功能。在开发和调试时可跳过所有卸载/重载开销。 -- **默认值**:`False`(等价于 `"0"`) -- **有效值**:`"1"`、`"true"`、`"yes"`(不区分大小写)表示禁用卸载;其他值保持默认行为 -- **用途**:设置后将完全绕过权重卸载 - -```bash -export AR_DISABLE_OFFLOAD=1 -``` - ## 配置最佳实践 1. **开发环境**:设置 `AR_LOG_LEVEL=TRACE` 或 `AR_LOG_LEVEL=DEBUG` 以获取详细日志