diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index fc8e8a2..08f6869 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -62,4 +62,4 @@ body: id: notes attributes: label: Additional context - description: CFG structure, HTML screenshots, logs, etc. \ No newline at end of file + description: CFG structure, HTML screenshots, logs, etc. diff --git a/.github/ISSUE_TEMPLATE/cfg_semantics.yml b/.github/ISSUE_TEMPLATE/cfg_semantics.yml index 32bdfe8..3e209d1 100644 --- a/.github/ISSUE_TEMPLATE/cfg_semantics.yml +++ b/.github/ISSUE_TEMPLATE/cfg_semantics.yml @@ -43,4 +43,4 @@ body: attributes: label: Desired CFG behavior validations: - required: true \ No newline at end of file + required: true diff --git a/.github/ISSUE_TEMPLATE/false_positive.yml b/.github/ISSUE_TEMPLATE/false_positive.yml index 83e0129..663a2d3 100644 --- a/.github/ISSUE_TEMPLATE/false_positive.yml +++ b/.github/ISSUE_TEMPLATE/false_positive.yml @@ -43,4 +43,4 @@ body: attributes: label: CFG-related? options: - - label: Control flow structure differs meaningfully \ No newline at end of file + - label: Control flow structure differs meaningfully diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index d058a54..2607b7b 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -43,4 +43,4 @@ body: - type: textarea id: alternatives attributes: - label: Alternatives considered \ No newline at end of file + label: Alternatives considered diff --git a/.github/actions/codeclone/README.md b/.github/actions/codeclone/README.md index 220eaec..1889dcd 100644 --- a/.github/actions/codeclone/README.md +++ b/.github/actions/codeclone/README.md @@ -8,4 +8,4 @@ Runs CodeClone to detect architectural code duplication in Python projects. - uses: orenlab/codeclone/.github/actions/codeclone@v1 with: path: . - fail-on-new: true \ No newline at end of file + fail-on-new: true diff --git a/.gitignore b/.gitignore index 8c0e115..3e551ca 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,6 @@ htmlcov/ .DS_Store # Logs -*.log \ No newline at end of file +*.log + +.claude diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8609d0a..ff65838 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,33 @@ +default_install_hook_types: [ pre-commit, pre-push ] + repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-merge-conflict + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-added-large-files + - id: check-toml + - id: check-yaml + - repo: local hooks: - - id: ruff-check - name: Ruff (lint) - entry: ruff check . + - id: ruff-format + name: Ruff (format) + entry: ruff format . language: system pass_filenames: false types: [ python ] + stages: [ pre-commit ] - - id: ruff-format - name: Ruff (format) - entry: ruff format . + - id: ruff-check + name: Ruff (lint) + entry: ruff check . language: system pass_filenames: false types: [ python ] + stages: [ pre-commit ] - id: mypy name: Mypy @@ -21,6 +35,7 @@ repos: language: system pass_filenames: false types: [ python ] + stages: [ pre-commit ] - id: codeclone name: CodeClone @@ -28,4 +43,13 @@ repos: language: system pass_filenames: false args: [ ".", "--ci" ] - types: [ python ] \ No newline at end of file + types: [ python ] + stages: [ pre-commit ] + + - id: pytest + name: Pytest + entry: pytest -q + language: system + pass_filenames: false + types: [ python ] + stages: [ pre-push ] diff --git a/CHANGELOG.md b/CHANGELOG.md index 20ea8bf..3cbf54d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ # Changelog +## [1.4.4] - 2026-03-14 + +### Performance + +- Optimized HTML snippet rendering hot path: + - file snippets now reuse cached full-file lines and slice ranges without + repeated full-file scans + - Pygments modules are loaded once per importer identity instead of + re-importing for each snippet +- Optimized block explainability range stats: + - replaced repeated full `ast.walk()` scans per range with a per-file + statement index + `bisect` window lookup + +### Tests + +- Preserved existing golden/contract behavior for `1.4.x` and kept report output + semantics unchanged while improving runtime overhead. + +### Contract Notes + +- No baseline/cache/report schema changes. +- No clone detection or fingerprint semantic changes. + ## [1.4.3] - 2026-03-03 ### Cache Contract @@ -328,7 +351,7 @@ codeclone . --update-baseline ### Overview -This release focuses on security hardening, robustness, and long-term maintainability. +This release focuses on security hardening, robustness, and long-term maintainability. No breaking API changes were introduced. The goal of this release is to provide users with a safe, deterministic, and CI-friendly @@ -336,49 +359,49 @@ tool suitable for security-sensitive and large-scale environments. ### Security & Robustness -- **Path Traversal Protection** +- **Path Traversal Protection** Implemented strict path validation to prevent scanning outside the project root or accessing sensitive system directories, including macOS `/private` paths. -- **Cache Integrity Protection** +- **Cache Integrity Protection** Added HMAC-SHA256 signing for cache files to prevent cache poisoning and detect tampering. -- **Parser Safety Limits** +- **Parser Safety Limits** Introduced AST parsing time limits to mitigate risks from pathological or adversarial inputs. -- **Resource Exhaustion Protection** +- **Resource Exhaustion Protection** Enforced a maximum file size limit (10MB) and a maximum file count per scan to prevent excessive memory or CPU usage. -- **Structured Error Handling** +- **Structured Error Handling** Introduced a dedicated exception hierarchy (`ParseError`, `CacheError`, etc.) and replaced broad exception handling with graceful, user-friendly failure reporting. ### Performance Improvements -- **Optimized AST Normalization** +- **Optimized AST Normalization** Replaced expensive `deepcopy` operations with in-place AST normalization, significantly reducing CPU and memory overhead. -- **Improved Memory Efficiency** +- **Improved Memory Efficiency** Added an LRU cache for file reading and optimized string concatenation during fingerprint generation. -- **HTML Report Memory Bounds** +- **HTML Report Memory Bounds** HTML reports now read only the required line ranges instead of entire files, reducing peak memory usage on large codebases. ### Architecture & Maintainability -- **Strict Type Safety** +- **Strict Type Safety** Migrated all optional typing to Python 3.10+ `| None` syntax and achieved 100% `mypy` strict compliance. -- **Modular CFG Design** +- **Modular CFG Design** Split CFG data structures and builder logic into separate modules (`cfg_model.py` and `cfg.py`) for improved clarity and extensibility. -- **Template Extraction** +- **Template Extraction** Extracted HTML templates into a dedicated `templates.py` module. - Added a `py.typed` marker for downstream type checkers. @@ -420,13 +443,13 @@ support for Python 3.10–3.14 across the test matrix. ### Fixed -- **CFG Exception Handling** +- **CFG Exception Handling** Fixed incorrect control-flow linking for `try`/`except` blocks. -- **Pattern Matching Support** +- **Pattern Matching Support** Added missing structural handling for `match`/`case` statements in the CFG. -- **Block Detection Scaling** +- **Block Detection Scaling** Made `MIN_LINE_DISTANCE` dynamic based on block size to improve clone detection accuracy across differently sized functions. @@ -436,7 +459,7 @@ support for Python 3.10–3.14 across the test matrix. ### BREAKING CHANGES -- **CLI Arguments** +- **CLI Arguments** Renamed output flags for brevity and consistency: - `--json-out` → `--json` - `--text-out` → `--text` diff --git a/LICENSE b/LICENSE index 8fe210f..994c5ef 100644 --- a/LICENSE +++ b/LICENSE @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/README.md b/README.md index 14ea6dc..9a0ff5d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ![Baseline](https://img.shields.io/badge/baseline-versioned-green?style=flat-square) [![License](https://img.shields.io/pypi/l/codeclone.svg?style=flat-square)](LICENSE) -**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**. +**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**. It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI. --- @@ -34,13 +34,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak **Three Detection Levels:** -1. **Function clones (CFG fingerprint)** +1. **Function clones (CFG fingerprint)** Strong structural signal for cross-layer duplication -2. **Block clones (statement windows)** +2. **Block clones (statement windows)** Detects repeated local logic patterns -3. **Segment clones (report-only)** +3. **Segment clones (report-only)** Internal function repetition for explainability; not used for baseline gating **CI-Ready Features:** diff --git a/SECURITY.md b/SECURITY.md index de26567..0c52920 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,7 +64,7 @@ If you believe you have discovered a security vulnerability, **do not open a pub Please report it privately via email: -**Email:** `pytelemonbot@mail.ru` +**Email:** `pytelemonbot@mail.ru` **Subject:** `Security issue in CodeClone` When reporting a vulnerability, please include: diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py index 6fc0c02..8205b17 100644 --- a/codeclone/_html_snippets.py +++ b/codeclone/_html_snippets.py @@ -14,6 +14,7 @@ from collections.abc import Iterable from dataclasses import dataclass from functools import lru_cache +from types import ModuleType from typing import NamedTuple, cast from .errors import FileProcessingError @@ -34,33 +35,19 @@ class _Snippet: class _FileCache: - __slots__ = ("_get_lines_impl", "maxsize") + __slots__ = ("_get_file_lines_impl", "maxsize") def __init__(self, maxsize: int = 128) -> None: self.maxsize = maxsize - self._get_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_range) + self._get_file_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_lines) @staticmethod - def _read_file_range( - filepath: str, start_line: int, end_line: int - ) -> tuple[str, ...]: - if start_line < 1: - start_line = 1 - if end_line < start_line: - return () - + def _read_file_lines(filepath: str) -> tuple[str, ...]: try: def _read_with_errors(errors: str) -> tuple[str, ...]: - lines: list[str] = [] with open(filepath, encoding="utf-8", errors=errors) as f: - for lineno, line in enumerate(f, start=1): - if lineno < start_line: - continue - if lineno > end_line: - break - lines.append(line.rstrip("\n")) - return tuple(lines) + return tuple(line.rstrip("\n") for line in f) try: return _read_with_errors("strict") @@ -72,7 +59,16 @@ def _read_with_errors(errors: str) -> tuple[str, ...]: def get_lines_range( self, filepath: str, start_line: int, end_line: int ) -> tuple[str, ...]: - return self._get_lines_impl(filepath, start_line, end_line) + if start_line < 1: + start_line = 1 + if end_line < start_line: + return () + lines = self._get_file_lines_impl(filepath) + start_index = start_line - 1 + if start_index >= len(lines): + return () + end_index = min(len(lines), end_line) + return lines[start_index:end_index] class _CacheInfo(NamedTuple): hits: int @@ -81,10 +77,30 @@ class _CacheInfo(NamedTuple): currsize: int def cache_info(self) -> _CacheInfo: - return cast(_FileCache._CacheInfo, self._get_lines_impl.cache_info()) + return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info()) -def _try_pygments(code: str) -> str | None: +_PYGMENTS_IMPORTER_ID: int | None = None +_PYGMENTS_API: tuple[ModuleType, ModuleType, ModuleType] | None = None + + +def _load_pygments_api() -> tuple[ModuleType, ModuleType, ModuleType] | None: + """ + Load pygments modules once per import-function identity. + + Tests monkeypatch `importlib.import_module`; tracking importer identity keeps + behavior deterministic and allows import-error branches to stay testable. + """ + global _PYGMENTS_IMPORTER_ID + global _PYGMENTS_API + + importer_id = id(importlib.import_module) + if importer_id != _PYGMENTS_IMPORTER_ID: + _PYGMENTS_IMPORTER_ID = importer_id + _PYGMENTS_API = None + if _PYGMENTS_API is not None: + return _PYGMENTS_API + try: pygments = importlib.import_module("pygments") formatters = importlib.import_module("pygments.formatters") @@ -92,6 +108,16 @@ def _try_pygments(code: str) -> str | None: except ImportError: return None + _PYGMENTS_API = (pygments, formatters, lexers) + return _PYGMENTS_API + + +def _try_pygments(code: str) -> str | None: + pygments_api = _load_pygments_api() + if pygments_api is None: + return None + pygments, formatters, lexers = pygments_api + highlight = pygments.highlight formatter_cls = formatters.HtmlFormatter lexer_cls = lexers.PythonLexer @@ -104,10 +130,10 @@ def _pygments_css(style_name: str) -> str: Returns CSS for pygments tokens. Scoped to `.codebox` to avoid leaking styles. If Pygments is not available or style missing, returns "". """ - try: - formatters = importlib.import_module("pygments.formatters") - except ImportError: + pygments_api = _load_pygments_api() + if pygments_api is None: return "" + _, formatters, _ = pygments_api try: formatter_cls = formatters.HtmlFormatter diff --git a/codeclone/_report_explain.py b/codeclone/_report_explain.py index ad26cc0..cb22179 100644 --- a/codeclone/_report_explain.py +++ b/codeclone/_report_explain.py @@ -9,6 +9,8 @@ from __future__ import annotations import ast +from bisect import bisect_left, bisect_right +from dataclasses import dataclass from pathlib import Path from ._report_explain_contract import ( @@ -23,6 +25,19 @@ from ._report_types import GroupItem, GroupMap +@dataclass(frozen=True, slots=True) +class _StatementRecord: + node: ast.stmt + start_line: int + end_line: int + start_col: int + end_col: int + type_name: str + + +_StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]] + + def _signature_parts(group_key: str) -> list[str]: return [part for part in group_key.split("|") if part] @@ -42,6 +57,53 @@ def _parsed_file_tree( return tree +def _build_statement_index(tree: ast.AST) -> _StatementIndex: + records = tuple( + sorted( + ( + _StatementRecord( + node=node, + start_line=int(getattr(node, "lineno", 0)), + end_line=int(getattr(node, "end_lineno", 0)), + start_col=int(getattr(node, "col_offset", 0)), + end_col=int(getattr(node, "end_col_offset", 0)), + type_name=type(node).__name__, + ) + for node in ast.walk(tree) + if isinstance(node, ast.stmt) + ), + key=lambda record: ( + record.start_line, + record.end_line, + record.start_col, + record.end_col, + record.type_name, + ), + ) + ) + start_lines = tuple(record.start_line for record in records) + return records, start_lines + + +def _parsed_statement_index( + filepath: str, + *, + ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], +) -> _StatementIndex | None: + if filepath in stmt_index_cache: + return stmt_index_cache[filepath] + + tree = _parsed_file_tree(filepath, ast_cache=ast_cache) + if tree is None: + stmt_index_cache[filepath] = None + return None + + index = _build_statement_index(tree) + stmt_index_cache[filepath] = index + return index + + def _is_assert_like_stmt(stmt: ast.stmt) -> bool: if isinstance(stmt, ast.Assert): return True @@ -64,45 +126,42 @@ def _assert_range_stats( start_line: int, end_line: int, ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> tuple[int, int, int]: cache_key = (filepath, start_line, end_line) if cache_key in range_cache: return range_cache[cache_key] - tree = _parsed_file_tree(filepath, ast_cache=ast_cache) - if tree is None: + statement_index = _parsed_statement_index( + filepath, + ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, + ) + if statement_index is None: range_cache[cache_key] = (0, 0, 0) return 0, 0, 0 - stmts = [ - node - for node in ast.walk(tree) - if isinstance(node, ast.stmt) - and int(getattr(node, "lineno", 0)) >= start_line - and int(getattr(node, "end_lineno", 0)) <= end_line - ] - if not stmts: + records, start_lines = statement_index + if not records: range_cache[cache_key] = (0, 0, 0) return 0, 0, 0 - ordered_stmts = sorted( - stmts, - key=lambda stmt: ( - int(getattr(stmt, "lineno", 0)), - int(getattr(stmt, "end_lineno", 0)), - int(getattr(stmt, "col_offset", 0)), - int(getattr(stmt, "end_col_offset", 0)), - type(stmt).__name__, - ), - ) + left = bisect_left(start_lines, start_line) + right = bisect_right(start_lines, end_line) + if left >= right: + range_cache[cache_key] = (0, 0, 0) + return 0, 0, 0 - total = len(ordered_stmts) + total = 0 assert_like = 0 max_consecutive = 0 current_consecutive = 0 - for stmt in ordered_stmts: - if _is_assert_like_stmt(stmt): + for record in records[left:right]: + if record.end_line > end_line: + continue + total += 1 + if _is_assert_like_stmt(record.node): assert_like += 1 current_consecutive += 1 if current_consecutive > max_consecutive: @@ -110,6 +169,10 @@ def _assert_range_stats( else: current_consecutive = 0 + if total == 0: + range_cache[cache_key] = (0, 0, 0) + return 0, 0, 0 + stats = (total, assert_like, max_consecutive) range_cache[cache_key] = stats return stats @@ -121,6 +184,7 @@ def _is_assert_only_range( start_line: int, end_line: int, ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> bool: total, assert_like, _ = _assert_range_stats( @@ -128,6 +192,7 @@ def _is_assert_only_range( start_line=start_line, end_line=end_line, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) return total > 0 and total == assert_like @@ -157,6 +222,7 @@ def _enrich_with_assert_facts( facts: dict[str, str], items: list[GroupItem], ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> None: assert_only = True @@ -181,6 +247,7 @@ def _enrich_with_assert_facts( start_line=start_line, end_line=end_line, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) total_statements += range_total @@ -198,6 +265,7 @@ def _enrich_with_assert_facts( start_line=start_line, end_line=end_line, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) ): @@ -223,6 +291,7 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]] Renderers (HTML/TXT/JSON) should only display these facts. """ ast_cache: dict[str, ast.AST | None] = {} + stmt_index_cache: dict[str, _StatementIndex | None] = {} range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {} facts_by_group: dict[str, dict[str, str]] = {} @@ -232,6 +301,7 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]] facts=facts, items=items, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) group_arity = len(items) diff --git a/docs/architecture.md b/docs/architecture.md index e6686b6..87ff40b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,6 +1,6 @@ # CodeClone Architecture -> Scope note: this file is an architecture narrative/deep-dive. +> Scope note: this file is an architecture narrative/deep-dive. > Contract-level guarantees (schemas, statuses, exit codes, trust model, determinism) are defined in `docs/book/`. This document describes the high-level architecture of **CodeClone**. diff --git a/docs/book/README.md b/docs/book/README.md index cb52a7b..6b06475 100644 --- a/docs/book/README.md +++ b/docs/book/README.md @@ -42,4 +42,4 @@ If a statement is not enforced by code/tests, it is explicitly marked as non-con - [appendix/a-status-enums.md](appendix/a-status-enums.md) - [appendix/b-schema-layouts.md](appendix/b-schema-layouts.md) -- [appendix/c-error-catalog.md](appendix/c-error-catalog.md) \ No newline at end of file +- [appendix/c-error-catalog.md](appendix/c-error-catalog.md) diff --git a/docs/cfg.md b/docs/cfg.md index 7ed2a6b..4fb9500 100644 --- a/docs/cfg.md +++ b/docs/cfg.md @@ -1,6 +1,6 @@ # Control Flow Graph (CFG) — Design and Semantics -> Scope note: this file is a CFG deep-dive. +> Scope note: this file is a CFG deep-dive. > Contract-level guarantees are documented in `docs/book/` (especially `05-core-pipeline.md` and `12-determinism.md`). This document describes the **Control Flow Graph (CFG)** model used by **CodeClone**,