diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..db19f01 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,54 @@ +name: Bug Report +description: Report incorrect behavior or numerical errors +labels: ["bug"] +body: + - type: textarea + id: description + attributes: + label: What happened? + description: Describe the bug. Include expected vs actual behavior. + placeholder: | + ts_mean([1, 2, 3], window=2) returns [None, None, 2.5] + Expected: [None, 1.5, 2.5] + validations: + required: true + + - type: textarea + id: reproduction + attributes: + label: Reproducible Example + render: python + placeholder: | + from elvers import load, ts_mean + + panel = load() + result = ts_mean(panel["close"], 5) + print(result.df) + validations: + required: true + + - type: dropdown + id: numerical + attributes: + label: Does this affect numerical output? + options: + - "Yes -- incorrect numerical values" + - "No -- crash, wrong type, or other non-numerical issue" + validations: + required: true + + - type: textarea + id: environment + attributes: + label: Environment + description: "Paste the output of: python -c \"import elvers; elvers.show_versions()\"" + render: text + placeholder: | + --------Elvers--------- + Elvers: 0.3.0 + Polars: 1.37.0 + Python: 3.12.0 + Platform: Linux-6.1.0-x86_64 + Architecture: x86_64 + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..7879fd0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: Question + url: https://github.com/quantbai/elvers/discussions/new?category=q-a + about: Ask a question or start a discussion. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..3feb4ac --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,26 @@ +name: Feature Request +description: Suggest an improvement or new capability +labels: ["enhancement"] +body: + - type: textarea + id: use_case + attributes: + label: Use Case + description: What problem does this solve? + placeholder: | + Computing returns requires verbose composition every time: + divide(ts_delta(close, 1), ts_delay(close, 1)) + validations: + required: true + + - type: textarea + id: proposed_api + attributes: + label: Proposed API + description: Show how it should work. + render: python + placeholder: | + from elvers import returns + ret = returns(close, window=1) + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/new_operator.yml b/.github/ISSUE_TEMPLATE/new_operator.yml new file mode 100644 index 0000000..06918d6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new_operator.yml @@ -0,0 +1,50 @@ +name: New Operator +description: Propose a new operator +labels: ["new operator"] +body: + - type: input + id: name + attributes: + label: Operator Name + description: "Naming: ts_ for time-series, group_ for group ops, no prefix for cross-sectional." + placeholder: "ts_entropy" + validations: + required: true + + - type: dropdown + id: module + attributes: + label: Module + options: + - "timeseries -- per-symbol rolling window" + - "cross_sectional -- across symbols per timestamp" + - "neutralization -- group operations" + - "math -- element-wise transforms" + - "base -- arithmetic and structural" + validations: + required: true + + - type: textarea + id: definition + attributes: + label: Definition and Use Case + description: Mathematical formula and why this operator is useful. + placeholder: | + H(X) = -sum(p_i * log(p_i)) + Measures randomness in a rolling window. Low entropy often precedes breakouts. 
+ validations: + required: true + + - type: textarea + id: example + attributes: + label: Example + description: Input and expected output. + render: python + placeholder: | + from elvers import load + panel = load() + result = ts_entropy(panel["close"], window=3) + print(result.df.head()) + validations: + required: false diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 56a2c89..c17f58f 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -5,13 +5,34 @@ - [ ] Bug fix (corrects incorrect behavior) - [ ] New feature (new operator or functionality) - [ ] Refactor (no behavior change) -- [ ] Numerical change (alters factor computation results) [BREAKING] +- [ ] Documentation only +- [ ] Numerical change (alters factor computation results) **[BREAKING]** ## Numerical Impact - + + + ## Testing - [ ] Added or updated tests - [ ] All tests pass (`pytest tests/ -v`) - [ ] Lint passes (`ruff check elvers/`) + +## New Operator Checklist + +- [ ] Operator added to `ops/__init__.py` imports and `__all__` +- [ ] Docstring includes: description, parameters, return type, null behavior, warmup +- [ ] Divisions handle zero denominators (exact zero check or Inf → null via Factor) +- [ ] No Python loops in computation (Polars expressions only) +- [ ] Uses `ddof=0` for std/variance (population statistics) +- [ ] Uses `min_samples=window` for rolling operations +- [ ] Tests cover: basic correctness, null handling, edge cases +- [ ] `OPERATORS.md` updated with the new operator + +## Reviewer Notes + + + + + diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c97dbc..5431bc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ Numerical changes are marked with [NUMERICAL]. ## [Unreleased] +### Changed +- [NUMERICAL] Removed arbitrary `1e-10` zero guards across all operators. Pure divisions (divide, inverse) now flow through Factor constructor (Inf → null). 
Statistical and regression operators use exact zero checks for degenerate cases (constant series). No more silent data loss from legitimate small values. + ## [0.3.0] - 2026-03-24 ### Fixed @@ -16,14 +19,14 @@ Numerical changes are marked with [NUMERICAL]. ### Changed - OPERATORS.md rewritten as pure operator reference manual (signatures, behavior, edge cases) -- Design rationale moved to CLAUDE.md Section 4.1 (developer-facing) +- Design rationale moved to CONTRIBUTING.md (Numerical Invariants section) - Fixed incorrect signatures in docs: `trade_when`, `scale`, `bucket` - Fixed README example code to use only columns present in sample data ## [0.2.0] - 2026-03-23 ### Added -- CLAUDE.md development standards (12 sections covering full workflow) +- CLAUDE.md development memo (architecture map, conduct rules, known limitations) - CI pipeline (GitHub Actions, pytest across Python 3.10-3.13) - Automated release pipeline (tag-triggered PyPI publish + GitHub Release) - Pre-commit hooks (ruff lint + format, pytest) @@ -31,7 +34,7 @@ Numerical changes are marked with [NUMERICAL]. - Dev dependencies (pytest, ruff, pre-commit) - `elvers/ops/_dev.py` for experimental operators - `elvers/ops/_validation.py` input validation helpers -- `load()` now accepts `interval` parameter for sub-daily data (e.g., "1h", "5m") +- `load()` supports any time frequency; panel skeleton built from timestamps present in data - Factor constructor validates required columns [timestamp, symbol, factor] - Tests for `divide()`, `reverse()`, `ts_product` (negative/zero), `ts_regression` (lag>0) @@ -42,7 +45,7 @@ Numerical changes are marked with [NUMERICAL]. 
- [NUMERICAL] `inverse()`: no zero protection; now returns null where abs(x) < 1e-10 - `ts_regression` rettype=7 (MSE): implicit Inf-to-null on window=2; now has explicit guard - `ts_count_nans`: used min_samples=1 unlike all other ts_* operators; aligned to min_samples=window -- `_balance()`: hardcoded daily frequency; now accepts interval parameter +- `_balance()`: replaced generated-range skeleton with union-based skeleton (no frequency inference) ### Changed - DEV operators (hump, ts_arg_max, ts_arg_min) moved from public API to `_dev.py` diff --git a/CLAUDE.md b/CLAUDE.md index acca1e2..1038130 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,334 +1,61 @@ -# Elvers Development Standards +# Elvers -- AI Development Memo -This document is the single source of truth for all development practices. -Any developer or LLM working on this project must follow these standards. +Production-grade factor computation for quantitative research. This memo is for AI assistants working on the codebase. + +For contribution workflow, numerical invariants, and coding standards, see [CONTRIBUTING.md](CONTRIBUTING.md). +For operator specifications, see [OPERATORS.md](OPERATORS.md). --- -## 1. Project Overview +## Conduct -Elvers is a Polars-native multi-factor computation library serving as the core factor -calculation layer for a crypto hedge fund. Production-grade correctness is non-negotiable. +**When uncertain, ask. Do not guess.** If you are making an assumption about intent, scope, API design, or anything you are not confident about -- stop and ask. Flag assumptions explicitly: "I am assuming X. Is that correct?" -- Language: Python 3.10+ -- Sole external dependency: polars>=1.37.0 -- Architecture: Panel (balanced container) -> Factor (immutable signal vector) -- PyPI: https://pypi.org/project/elvers/ -- Repository: https://github.com/quantbai/elvers +- **Design before implementing.** Think through module placement, API surface, and long-term implications before writing code. 
+- **Write cold, factual prose.** No emotional language ("non-negotiable", "must never"). No marketing ("world-class"). State facts and trade-offs. +- **Describe upstream behavior, do not complain.** Say "Polars applies ddof asymmetrically; Elvers isolates this by using ddof=1" -- not "Polars has a bug". +- **`__init__.py` is for imports/exports only.** New functionality goes in dedicated modules. --- -## 2. Architecture +## Architecture ``` elvers/ - __init__.py Package root. __version__ is the single source of truth for version. - core/factor.py Factor class (immutable container: [timestamp, symbol, factor]) - io/loader.py Data loading and panel balancing (supports daily/hourly/minutely via interval param) - io/panel.py Balanced panel container - ops/base.py Arithmetic operators (add, subtract, multiply, divide, reverse, densify, bucket) - ops/timeseries.py Time-series operators (ts_mean, ts_rank, ts_regression, etc.) - ops/cross_sectional.py Cross-sectional operators (rank, zscore, scale, signal, etc.) - ops/math.py Mathematical operators (log, sqrt, power, inverse, etc.) + __init__.py Imports/exports only. __version__ is the single version source. 
+ _meta.py Diagnostics (show_versions) + core/factor.py Factor (immutable: [timestamp, symbol, factor]) + io/loader.py load(): read -> validate -> check intervals -> balance + io/panel.py Panel (balanced N symbols x T periods) + ops/base.py Arithmetic + ops/timeseries.py Time-series (per-symbol rolling window) + ops/cross_sectional.py Cross-sectional (across symbols per timestamp) + ops/math.py Math (element-wise) ops/neutralization.py Neutralization and group operators - ops/_dev.py Development/experimental operators (not exported, not production) + ops/_dev.py Experimental (not exported) ops/_validation.py Input validation helpers tests/ - conftest.py Test fixtures (make_ts, make_factor) - test_*.py One test file per operator module -OPERATORS.md Operator specification: numerical conventions, per-operator behavior, design rationale -CLAUDE.md Development standards (this file) -``` - ---- - -## 3. Git Workflow - -### Branch Strategy - -``` -main (protected) -- production-ready, always passes CI, tagged releases come from here - dev -- integration branch, CI must be green before merge to main - feature/XXX -- short-lived feature branches off dev - fix/XXX -- bug fix branches off dev -``` - -### Daily Development (Step by Step) - -```bash -# 1. Start from latest dev -git checkout dev -git pull origin dev - -# 2. Create feature/fix branch -git checkout -b fix/bug-name # or feature/feature-name - -# 3. Make changes, then verify locally -ruff check elvers/ # lint -pytest tests/ -v # all tests must pass - -# 4. Commit (see Commit Convention below) -git add # never use git add -A -git commit -m "fix(ops): description" - -# 5. Push and create PR -git push -u origin fix/bug-name -# -> Create PR on GitHub: fix/bug-name -> dev -# -> Fill in PR template (change type, numerical impact, testing) -# -> CI runs automatically (lint + pytest x4 Python versions) - -# 6. After review approval and CI green: squash merge on GitHub -# 7. 
Delete the feature branch -``` - -### Branch Rules - -- Never force push to main or dev -- Never commit directly to main (always through PR from dev) -- Feature branches should live no longer than 1-2 days -- Each PR should be a single logical change - ---- - -## 4. Coding Standards - -### 4.1 Numerical Correctness (Highest Priority) - -Operator behavior reference: [OPERATORS.md](OPERATORS.md). -The rules below are for writing new code: - -- All divisions MUST have explicit zero guards: - `pl.when(denom.abs() < 1e-10).then(None).otherwise(num / denom)` -- NEVER rely on the Factor constructor's implicit Inf-to-null conversion as normal logic flow -- Null semantics: null propagates naturally through Polars expressions. Boundary cases - (zero denominator, constant window, insufficient data) must be handled explicitly - -#### Design Decisions (rationale for current conventions) - -- **NaN/Inf unified to null**: eliminates the NaN-infection problem (`NaN + 1 = NaN`) - that silently corrupts downstream computations. The Factor constructor converts on - creation so the entire library operates on a single missing-value type. -- **ddof=0 everywhere**: rolling windows and cross-sections operate on the full observed - population, not a sample from a larger one. ddof=0 is semantically correct and avoids - n=1 division-by-zero (ddof=1 divides by n-1=0). -- **ts_corr/ts_autocorr use ddof=1 internally**: Polars `rolling_corr(ddof=0)` has a bug - where ddof only applies to the covariance numerator, not the variance denominator, - producing values outside [-1, 1]. Reported: https://github.com/pola-rs/polars/issues/16161. - Correlation is ddof-invariant (cancels in ratio), so ddof=1 output is correct. -- **rank range (0, 1] not [0, 1]**: a rank of 0 is ambiguous (could mean "missing" or - "lowest"). Strictly positive range ensures every ranked value is distinguishable from null. 
-- **Zero guard threshold 1e-10**: conservative enough to catch near-zero denominators, - small enough not to interfere with legitimate small values in financial data. -- **ts_product sign-magnitude decomposition**: naive `exp(sum(log(x)))` fails for negative - inputs because `log(x)` is undefined for x < 0. Separating sign and magnitude handles - this correctly. - -### 4.2 Operator Writing Rules - -- Time-series operators: always use `min_samples=window` for consistency -- Every new operator MUST include: functional test + null handling test + edge case test -- Docstrings MUST include: description, parameters, return type, null behavior, warmup period -- Naming: `ts_` prefix for time-series, `group_` prefix for group ops, no prefix for cross-sectional -- New operators must be added to `ops/__init__.py` exports and `__all__` - -### 4.3 Defensive Programming - -- Validate `window >= 1` and integer type at operator entry points -- Validate `0 <= q <= 1` for quantile parameters -- Validate Factor type on inputs (not raw DataFrame) -- Factor constructor validates required columns [timestamp, symbol, factor] -- Use validation helpers from `elvers/ops/_validation.py` - -### 4.4 Immutability - -- Factor objects are immutable after creation -- All operators return new Factor instances; never mutate inputs -- Factor._df and Factor._name are the only instance attributes (__slots__ enforced) - -### 4.5 Style - -- No emoji in code, comments, commit messages, or documentation -- English for all code, comments, and commit messages -- Ruff handles formatting and import sorting; run `ruff check --fix` before committing - ---- - -## 5. 
Testing Standards - -- Framework: pytest -- Config: pyproject.toml [tool.pytest.ini_options] -- Every public operator must have at least one test -- Numerical precision: use `pytest.approx(expected, abs=1e-10)` -- Test fixtures: `make_ts(vals, symbol)` and `make_factor(values, day)` from conftest.py -- Edge cases to cover: all-null input, single symbol, constant series (zero variance), window > data length -- Test file naming: `test_.py` matches `ops/.py` - ---- - -## 6. Commit Convention - -Format: `(): ` - -Types: -- fix: bug fix (state if it changes numerical output) -- feat: new operator or feature -- refactor: restructuring without behavior change -- test: test additions or modifications only -- perf: performance optimization -- docs: documentation only -- ci: CI/CD configuration -- release: version bump and CHANGELOG update (release commits only) - -Rules: -- Each commit is atomic: one logical change per commit -- Each commit must pass all tests (no "break then fix" sequences) -- Numerical changes must document the impact in the commit body with [NUMERICAL] tag -- No WIP, temp, or placeholder commits in main/dev branches - -Example: -``` -fix(ops): ts_product handle negative inputs - -log-sum-exp trick silently returned null for negative factor values. -Replaced with sign-magnitude decomposition: count negatives in window -to determine product sign, use exp(sum(log(abs(x)))) for magnitude. - -[NUMERICAL] ts_product output changes for any window containing -negative values. Previously returned null, now returns correct product. -``` - ---- - -## 7. Code Review Rules - -- When the team has multiple developers, enable "Require approvals" in branch protection -- Currently (single-developer mode): CI status checks are required, review approval is optional -- Reviewer must verify: - 1. Tests pass and cover the change - 2. Numerical correctness (manually verify at least one expected value) - 3. Zero-guard pattern followed for all divisions - 4. 
No implicit Inf-to-null reliance -- PRs that change numerical output must include before/after comparison in PR description -- PRs that add new operators must include the operator in `__all__` exports - ---- - -## 8. Versioning - -Format: MAJOR.MINOR.PATCH (Semantic Versioning) - -| Change Type | Version Bump | Example | -|-------------|-------------|---------| -| Bug fix, no numerical change | PATCH (0.0.x) | Add input validation | -| Bug fix that changes numerical output | MINOR (0.x.0) | Fix ts_product for negatives | -| New operator or feature | MINOR (0.x.0) | Add ts_entropy operator | -| Breaking API change | MAJOR (x.0.0) | Rename Factor to Signal | - -Version number lives in one place only: `elvers/__init__.py` -> `__version__`. -pyproject.toml reads it dynamically via `[tool.setuptools.dynamic]`. - ---- - -## 9. Release Process - -Releases are triggered by git tags. CI automatically: runs tests, builds package, -publishes to PyPI, and creates a GitHub Release with auto-generated release notes. - -### Step-by-Step Release - -```bash -# 1. Make sure dev is clean and CI passes -git checkout dev -git pull origin dev -pytest tests/ -v -ruff check elvers/ - -# 2. Update version number (single source of truth) -# Edit elvers/__init__.py: __version__ = "X.Y.Z" - -# 3. Update CHANGELOG.md -# Move items from [Unreleased] to [X.Y.Z] - YYYY-MM-DD - -# 4. Commit the release -git add elvers/__init__.py CHANGELOG.md -git commit -m "release: vX.Y.Z" -git push origin dev - -# 5. Create PR: dev -> main on GitHub -# Title: "release: vX.Y.Z" -# Wait for CI to pass and review approval -# Squash merge on GitHub - -# 6. Tag on main (after PR merged) -git checkout main -git pull origin main -git tag vX.Y.Z -git push origin vX.Y.Z - -# 7. 
Automated (triggered by tag push): -# - CI runs full test suite again -# - Builds sdist and wheel (python -m build) -# - Publishes to PyPI via Trusted Publisher -# - Creates GitHub Release with auto-generated notes -# - Attaches built artifacts to the Release -``` - -### What Happens Automatically - -When you push a tag like `vX.Y.Z`: - -1. `.github/workflows/publish.yml` triggers -2. Runs full test suite on Python 3.10-3.13 (safety net) -3. If tests pass: builds package, publishes to PyPI -4. Creates a GitHub Release page at github.com/quantbai/elvers/releases - with auto-generated release notes from commit messages -5. Users can now `pip install elvers==X.Y.Z # specific version` - -### What You See on GitHub After Release - -- **Releases** section (right sidebar) shows all published versions -- Each release has: tag name, release notes (from commits), downloadable artifacts -- **Packages** section: ignore this, we use PyPI not GitHub Packages - ---- - -## 10. Setup - -### Infrastructure (already configured) - -- PyPI Trusted Publisher: configured for quantbai/elvers -> publish.yml -- GitHub Branch Protection on main: require PR, require CI status checks -- GitHub Actions: ci.yml (push/PR) + publish.yml (tag-triggered release) - -### Local Development Setup (every new developer) - -```bash -git clone https://github.com/quantbai/elvers.git -cd elvers -git checkout dev -pip install -e ".[dev]" -pre-commit install + conftest.py Fixtures: make_ts, make_factor + test_*.py One file per ops module ``` ---- +Patterns: +- Time-series operators: `expr.over("symbol")` +- Cross-sectional operators: `expr.over("timestamp")` +- All operators: `Factor -> Factor`, stateless, immutable -## 11. 
Quick Reference - -```bash -pip install -e ".[dev]" # Setup -pre-commit install # Git hooks -pytest tests/ -v # Run all tests -pytest tests/test_timeseries.py::TestTsProduct -v # Single test class -ruff check elvers/ --fix # Lint + auto-fix -ruff format elvers/ # Format code -``` +Data loading pipeline (each function has one responsibility): +- `_read_source`: file I/O +- `_validate`: schema, types (timestamp must be pl.Date or pl.Datetime), duplicates, Inf detection +- `_check_intervals`: warn if timestamps are irregularly spaced +- `_balance`: union-based skeleton (not generated range), cross join symbols, left join data --- -## 12. Known Limitations / Future Work +## Known Limitations -- `trade_when` uses a sentinel value (-1.79e308) for exit signals; should be replaced with struct-based approach -- Input validation (`_validation.py`) is defined but not yet wired into all operator entry points -- No property-based testing (Hypothesis); would strengthen numerical correctness guarantees -- No performance benchmarks; should establish baseline timings for core operators -- `_dev.py` operators (hump, ts_arg_max, ts_arg_min) use Python callbacks and are not production-grade +- `trade_when`: sentinel value (-1.79e308) should become struct-based +- `_validation.py`: not wired into all operator entry points +- `_dev.py`: Python callbacks, not production-grade +- No property-based testing or performance benchmarks +- No static type checking (mypy/pyright) in CI; type hints are written but not enforced diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..a36e841 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,7 @@ +# Code of Conduct + +This project follows the [Contributor Covenant v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/). 
+ +## Reporting + +Report unacceptable behavior via [GitHub private reporting](https://github.com/quantbai/elvers/security/advisories/new) or by contacting the maintainers through a private GitHub issue. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..32a9f8e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,177 @@ +# Contributing to Elvers + +> **Pre-1.0 development.** API is stabilizing but may change between minor versions. +> Operator coverage and numerical conventions are under active refinement. + +--- + +## Getting Started + +```bash +# Fork on GitHub, then: +git clone https://github.com//elvers.git +cd elvers +git remote add upstream https://github.com/quantbai/elvers.git +git checkout dev +pip install -e ".[dev]" +pre-commit install +``` + +Verify: + +```bash +pytest tests/ -v +ruff check elvers/ +``` + +--- + +## Workflow + +### Branches + +``` +main (protected) -- tagged releases only + dev -- integration, CI must pass + feature/XXX -- new operators or features + fix/XXX -- bug fixes +``` + +All changes enter through pull requests to `dev`. + +### Development Cycle + +```bash +git checkout dev && git pull upstream dev # 1. sync +git checkout -b feature/ts-entropy # 2. branch +# ... implement ... +ruff check elvers/ && pytest tests/ -v # 3. verify +git add # 4. stage (never git add -A) +git commit -m "feat(ops): add ts_entropy" # 5. commit +git push -u origin feature/ts-entropy # 6. push +# 7. open PR -> dev +``` + +--- + +## Adding a New Operator + +### 1. Choose the Module + +| Prefix | File | Scope | +|--------|------|-------| +| `ts_` | `ops/timeseries.py` | Per-symbol rolling window | +| (none) | `ops/cross_sectional.py` | Across symbols per timestamp | +| `group_` | `ops/neutralization.py` | Within groups per timestamp | +| (none) | `ops/math.py` | Element-wise math | +| (none) | `ops/base.py` | Arithmetic and structural | + +### 2. 
Implement + +Adhere to the numerical invariants below and follow existing patterns in the target module. + +### 3. Export + +Add to `elvers/ops/__init__.py`: import line and `__all__` list. + +### 4. Test + +Add tests in `tests/test_.py`: + +- **Correctness**: verify against expected values (`pytest.approx`). For complex operators (regression, covariance, decay), cross-validate against NumPy or SciPy on randomized inputs. +- **Null propagation**: null inputs produce correct null outputs +- **Edge cases**: constant series, all-null, window > data length, zero denominators + +### 5. Document + +Add entry to [OPERATORS.md](OPERATORS.md). + +### 6. Pre-PR Checklist + +- [ ] `ruff check elvers/` passes +- [ ] `pytest tests/ -v` passes (full suite) +- [ ] Operator in `__init__.py` exports and `__all__` +- [ ] Docstring: description, parameters, return type, null behavior, warmup +- [ ] Divisions handle zero denominators (exact zero check or Inf → null via Factor) +- [ ] No Python loops in computation +- [ ] Tests cover: correctness, null, edge cases +- [ ] `OPERATORS.md` updated + +--- + +## Numerical Invariants + +Elvers maintains the following invariants. + +- NaN and Inf are unified to null. Single missing-value type throughout. +- Pure division (divide, inverse): no guard. Inf → null via Factor constructor. +- Population statistics: ddof=0 for std, variance, covariance. +- Rank range: (0, 1]. Ties: `average`. Zero excluded. +- Rolling warmup: first `window - 1` values per symbol are null. + +Full conventions: [OPERATORS.md](OPERATORS.md). + +### Design Rationale + +| Decision | Rationale | Trade-off | +|----------|-----------|-----------| +| NaN/Inf -> null | Eliminates NaN infection (`NaN + 1 = NaN`). One missing-value type simplifies null-propagation logic. | Loses the distinction between missing data and computation overflow. Acceptable for daily/hourly factor research where Inf indicates a defect, not a signal. 
May need revisiting for tick-level microstructure data. | +| ddof=0 | Rolling windows observe the full population within the window, not a sample. Avoids n=1 division-by-zero. | ddof=1 would be appropriate for random samples. Deterministic lookbacks are not random samples. | +| Rank (0, 1] | Zero is ambiguous (missing or lowest?). Strictly positive values are distinguishable from null. | Downstream code cannot assume [0, 1] range. | +| Division by zero | Pure divisions (divide, inverse) have no guard; Inf → null via Factor constructor. Statistical operators (zscore, scale, etc.) check for exact zero denominators and return semantic defaults (e.g., 0.0 for constant series). | Near-zero denominators produce large but finite values. Use winsorize/truncate to handle outliers. | +| ts_corr uses ddof=1 internally | Polars `rolling_corr(ddof=0)` applies ddof only to the covariance numerator, not the variance denominator, producing values outside [-1, 1] ([polars#16161](https://github.com/pola-rs/polars/issues/16161)). Elvers isolates this by using ddof=1, which is mathematically equivalent for correlation (ddof cancels in the ratio). | Will align to ddof=0 when the upstream issue is resolved. | + +--- + +## Style + +- Ruff for formatting and linting (line-length = 120) +- English for all code, comments, commits, and documentation +- No emoji + +### Commits + +Format: `(): ` + +| Type | Use | +|------|-----| +| feat | New operator or feature | +| fix | Bug fix | +| refactor | No behavior change | +| test | Tests only | +| perf | Performance | +| docs | Documentation | +| ci | CI/CD | + +Numerical changes require a `[NUMERICAL]` tag in the commit body with impact description. + +--- + +## Pull Requests + +- One PR = one logical change +- Target: `dev` branch +- CI must pass on Python 3.10, 3.11, 3.12, 3.13 + +Numerical output changes require before/after comparison in the PR description. + +### Review Criteria + +1. Tests pass and cover the change +2. 
Numerical outputs cross-validated (simple operators: known values; complex operators: NumPy/SciPy oracle) +3. Division handling follows the invariants (pure divisions: Inf → null via Factor; statistical/regression operators: exact zero check on denominators) +4. Degenerate cases (constant series, zero denominators) return the documented semantic defaults +5. Operator exported and documented + +--- + +## Versioning + +MAJOR.MINOR.PATCH ([SemVer](https://semver.org/)). + +| Change | Bump | +|--------|------| +| Bug fix, no numerical change | PATCH | +| Numerical output change | MINOR | +| New operator | MINOR | +| Breaking API change | MAJOR | diff --git a/OPERATORS.md b/OPERATORS.md index 42d7a85..03b84fa 100644 --- a/OPERATORS.md +++ b/OPERATORS.md @@ -10,10 +10,10 @@ | --- | --- | | Missing values | NaN and Inf are converted to null on Factor creation. Single missing-value type (null) throughout. | | Null arithmetic | `5.0 + null = null`. Use `filter=True` on add/subtract/multiply to treat null as identity (0 for +/-, 1 for *). | -| Division by zero | All divisions guarded at `abs(denom) < 1e-10`, returning null. | +| Division by zero | Pure divisions (divide, inverse): no guard, Inf → null via Factor. Statistical/regression operators: exact zero check on denominator, returns semantic default (0.0 for zscore, null for CV, etc.). | | Std / Variance | ddof=0 (population) for all std, variance, covariance, zscore, normalize, winsorize. | | Correlation | ddof-invariant. `ts_covariance(x,y,w) / (ts_std_dev(x,w) * ts_std_dev(y,w)) == ts_corr(x,y,w)`. | -| Rank | Range (0, 1]. Does not include zero. Ties: `average` method. Null excluded. | +| Rank | Range (0, 1]. Does not include zero. Ties: `average` method. Null excluded. Applies to: `rank`, `ts_rank`, `group_rank`, `quantile`. Exception: `densify` uses `dense` (consecutive integers). | | Rolling warmup | All `ts_*` operators: first `window-1` values per symbol are null (`min_samples=window`). | --- @@ -40,7 +40,7 @@ Element-wise multiplication. `filter=True`: null treated as 1. ### `divide(a, b)` -Element-wise division. Returns null where `abs(b) < 1e-10`. +Element-wise division. 
Division by zero → Inf → null via Factor. ### `reverse(x)` @@ -110,7 +110,7 @@ Rolling excess kurtosis. Fisher definition (normal = 0). ### `ts_zscore(x, window)` -`(x - rolling_mean) / rolling_std`. ddof=0. Returns 0 if `std < 1e-10`. +`(x - rolling_mean) / rolling_std`. ddof=0. Returns 0 if `std = 0`. ### `ts_corr(x, y, window)` @@ -152,7 +152,7 @@ Periods since last value change. Returns 0 on change. No warmup. ### `ts_scale(x, window)` -Min-max normalization: `(x - min) / (max - min)`. Range: [0, 1]. Returns 0 if range < 1e-10. +Min-max normalization: `(x - min) / (max - min)`. Range: [0, 1]. Returns 0 if range = 0. ### `ts_percentile(x, window, q)` @@ -164,7 +164,7 @@ Rolling percentile at quantile `q` in [0, 1]. ### `ts_cv(x, window)` -Coefficient of variation: `std / abs(mean)`. ddof=0. Returns null if `abs(mean) < 1e-10`. +Coefficient of variation: `std / abs(mean)`. ddof=0. Returns null if mean = 0. ### `ts_autocorr(x, window, lag=1)` @@ -188,7 +188,7 @@ Most recent value in lookback that differs from current. Null if none. ### `inst_tvr(x, window)` -Instrument turnover: `sum(abs(delta)) / sum(abs(x))`. Returns 0 if denom < 1e-10. +Instrument turnover: `sum(abs(delta)) / sum(abs(x))`. Returns 0 if denom = 0. ### `ts_delta_limit(x, y, limit_volume=0.1)` @@ -211,7 +211,7 @@ Rolling OLS regression. | 8 | Std error of beta | | 9 | Std error of alpha | -Zero guard on `sum(x^2) < 1e-10` and `SST < 1e-10`. +Degenerate cases: beta = 0 if `var(x) = 0`, R² = 1 if `SST = 0`. ### `trade_when(trigger, alpha, exit_cond)` @@ -231,7 +231,7 @@ Example: `(4, 3, 6, 10, 2)` -> `(0.6, 0.4, 0.8, 1.0, 0.2)` ### `zscore(x)` -`(x - mean) / std`. ddof=0. Returns 0 if `std < 1e-10`. +`(x - mean) / std`. ddof=0. Returns 0 if `std = 0`. ### `mean(x)` @@ -243,7 +243,7 @@ Cross-sectional median, broadcast to all symbols. ### `scale(x, target=1.0, longscale=0.0, shortscale=0.0)` -Scale so `sum(abs(x)) = target`. 
When `longscale`/`shortscale` are non-zero, scale long and short legs separately. Returns 0 if sum < 1e-10. +Scale so `sum(abs(x)) = target`. When `longscale`/`shortscale` are non-zero, scale long and short legs separately. Returns 0 if sum = 0. ### `normalize(x, use_std=False, limit=0.0)` @@ -255,7 +255,7 @@ Rank then inverse CDF. Drivers: `gaussian`, `uniform`, `cauchy`. Acklam approxim ### `signal(x)` -Zero-mean, unit-absolute-sum normalization. Returns 0 if `abs_sum < 1e-10` or `count < 2`. +Zero-mean, unit-absolute-sum normalization. Returns 0 if `abs_sum = 0` or `count < 2`. ### `winsorize(x, std=4)` @@ -279,11 +279,11 @@ Null values below `minimum`. ### `vector_neut(x, y)` -Orthogonal residual: `x - proj_y(x)`. Returns `x` if `dot(y,y) < 1e-10`. +Orthogonal residual: `x - proj_y(x)`. Returns `x` if `dot(y,y) = 0`. ### `regression_neut(y, x)` -OLS residual: `y - (alpha + beta * x)`. beta = 0 if `var(x) < 1e-10`. +OLS residual: `y - (alpha + beta * x)`. beta = 0 if `var(x) = 0`. ### `group_neutralize(x, group)` @@ -295,19 +295,19 @@ Rank within group. (0, 1], `average`, null excluded. ### `group_zscore(x, group)` -Z-score within group. ddof=0. Returns 0 if group std < 1e-10. +Z-score within group. ddof=0. Returns 0 if group std = 0. ### `group_scale(x, group)` -Min-max within group to [0, 1]. Returns 0 if range < 1e-10. +Min-max within group to [0, 1]. Returns 0 if range = 0. ### `group_normalize(x, group, target=1)` -Scale within group so `sum(abs(x)) = target`. Returns 0 if sum < 1e-10. +Scale within group so `sum(abs(x)) = target`. Returns 0 if sum = 0. ### `group_mean(x, group, weight=None)` -Group mean. With weight: weighted mean, falls back to unweighted if `sum(weight) < 1e-10`. +Group mean. With weight: weighted mean, falls back to unweighted if `sum(weight) = 0`. ### `group_median(x, group)` @@ -343,7 +343,7 @@ Square root. Null for negative inputs. For sign-preserving: `signed_power(x, 0.5 ### `inverse(x)` -`1 / x`. 
Returns null where `abs(x) < 1e-10`. +`1 / x`. Division by zero → Inf → null via Factor. ### `s_log_1p(x)` diff --git a/README.md b/README.md index 4a89603..c6cdaf7 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,15 @@ Polars-native factor computation engine for quantitative research. All operators execute as Rust-backed Polars expressions with no Python loops in the hot path. -## Core Abstractions +> **Pre-1.0 development.** API is stabilizing but may have breaking changes between minor versions. See [CHANGELOG.md](CHANGELOG.md). -- **`Panel`** -- Balanced `(timestamp, symbol)` container with strict alignment guarantees. Prevents look-ahead bias by construction. -- **`Factor`** -- Immutable signal vector. Every operator takes `Factor` and returns `Factor` with eager evaluation. +## Design + +Elvers enforces three structural invariants: + +- **Immutability.** Every `Factor` is frozen after creation. Operators return new instances; no in-place mutation. +- **Balanced alignment.** `Panel` guarantees every symbol shares the same timestamp index. Missing values are explicit nulls, not silent omissions. +- **Single null semantics.** NaN and Inf are converted to null on `Factor` creation. One missing-value type eliminates NaN-infection (`NaN + 1 = NaN`). 
## Installation @@ -25,10 +30,14 @@ pip install elvers ## Usage +Elvers accepts any Polars DataFrame with `timestamp` and `symbol` columns: + ```python from elvers import load, ts_rank, ts_regression, zscore, signal -panel = load() # built-in sample data (crypto 1d OHLCV) +panel = load("ohlcv.parquet") # CSV, Parquet, or pl.DataFrame +panel = load() # built-in sample: crypto daily OHLCV + close, volume = panel["close"], panel["volume"] momentum = ts_rank(close, 20) @@ -37,17 +46,13 @@ beta_resid = ts_regression(close, volume, window=60, rettype=0) alpha = signal(vol_adj) ``` -Sub-daily data is supported via the `interval` parameter: - -```python -panel = load("hourly.parquet", interval="1h") -``` +Any time frequency is supported. The panel skeleton is built from the union of timestamps present in the data. ## Operators 72 operators. All accept and return `Factor`. -**Time-Series** -- rolling window per symbol: +**Time-Series** -- per-symbol rolling window: `ts_delay` `ts_delta` `ts_mean` `ts_sum` `ts_std_dev` `ts_min` `ts_max` `ts_median` `ts_rank` `ts_skewness` `ts_kurtosis` `ts_zscore` `ts_corr` `ts_covariance` `ts_product` `ts_step` `ts_decay_linear` `ts_decay_exp_window` `days_from_last_change` `ts_av_diff` `ts_scale` `ts_percentile` `ts_quantile` `ts_cv` `ts_autocorr` `ts_count_nans` `ts_backfill` `kth_element` `last_diff_value` `inst_tvr` `ts_delta_limit` `ts_regression` `trade_when` @@ -55,11 +60,11 @@ panel = load("hourly.parquet", interval="1h") `rank` `zscore` `mean` `median` `scale` `normalize` `quantile` `signal` `winsorize` `truncate` `left_tail` `right_tail` -**Neutralization and Group** -- sector/industry neutralization: +**Neutralization and Group** -- sector/industry control: `vector_neut` `regression_neut` `group_neutralize` `group_rank` `group_zscore` `group_scale` `group_normalize` `group_mean` `group_median` `group_backfill` -**Math**: +**Math** -- element-wise transforms: `log` `sqrt` `sign` `power` `signed_power` `inverse` `s_log_1p` 
`maximum` `minimum` `where` @@ -67,7 +72,7 @@ panel = load("hourly.parquet", interval="1h") `add` `subtract` `multiply` `divide` `reverse` `densify` `bucket` and standard operators (`+` `-` `*` `/` `**` `abs`) -Full specifications and numerical conventions: **[OPERATORS.md](OPERATORS.md)** +Full specifications: **[OPERATORS.md](OPERATORS.md)** ## Development @@ -77,4 +82,4 @@ pytest tests/ -v ruff check elvers/ ``` -See [CLAUDE.md](CLAUDE.md) for full development standards. +See [CONTRIBUTING.md](CONTRIBUTING.md) for the contributor guide. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..3b01087 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,20 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.3.x | Yes | +| < 0.3 | No | + +## Reporting a Vulnerability + +**Do not open a public issue for security vulnerabilities.** + +Report via [GitHub Private Vulnerability Reporting](https://github.com/quantbai/elvers/security/advisories/new). + +You will receive a response within 72 hours. + +## Scope + +Elvers is a computation library with no network access, no file system writes, and no user authentication. The primary security concern is **numerical correctness** -- if an operator produces silently incorrect results, report it as a high-priority bug via a public issue. 
diff --git a/elvers/__init__.py b/elvers/__init__.py index b2eac87..47adbef 100644 --- a/elvers/__init__.py +++ b/elvers/__init__.py @@ -7,9 +7,10 @@ __version__ = "0.3.0" __author__ = "quantbai" +from ._meta import show_versions from .core import Factor from .io import Panel, load from .ops import * # noqa: F401,F403 from .ops import __all__ as _ops_all -__all__ = ["__version__", "__author__", "Factor", "load", "Panel"] + list(_ops_all) +__all__ = ["__version__", "__author__", "Factor", "load", "Panel", "show_versions"] + list(_ops_all) diff --git a/elvers/_meta.py b/elvers/_meta.py new file mode 100644 index 0000000..944bdb4 --- /dev/null +++ b/elvers/_meta.py @@ -0,0 +1,33 @@ +"""Package diagnostics for bug reports.""" + +from __future__ import annotations + +import platform +import sys + +import polars as pl + +from . import __version__ + + +def show_versions() -> str: + """Return version and environment info as a formatted string. + + Also prints to stdout for convenience. + + Returns + ------- + str + Multi-line version info block, suitable for pasting into bug reports. + """ + lines = [ + "--------Elvers---------", + f"Elvers: {__version__}", + f"Polars: {pl.__version__}", + f"Python: {sys.version}", + f"Platform: {platform.platform()}", + f"Architecture: {platform.machine()}", + ] + text = "\n".join(lines) + print(text) + return text diff --git a/elvers/io/loader.py b/elvers/io/loader.py index c6c91f9..11e3b2f 100644 --- a/elvers/io/loader.py +++ b/elvers/io/loader.py @@ -14,7 +14,6 @@ def load( source: str | Path | pl.DataFrame | None = None, format: str = "parquet", - interval: str = "1d", ) -> Panel: """Load data into a balanced Panel. @@ -22,28 +21,26 @@ def load( fully balanced (N symbols x T periods) long-format panel. Missing factor values are filled with null. + The time skeleton is the union of all timestamps present in the data. + Every symbol gets every timestamp; missing values are null. 
+ Parameters ---------- source : str, Path, pl.DataFrame, or None Data source. None loads the built-in sample dataset. format : str File format for built-in data ("csv" or "parquet"). - interval : str - Time interval for the balanced panel skeleton (e.g., "1d", "1h", "5m"). - Must match the frequency of the input data. - - When called with no arguments, loads the built-in sample dataset - (daily crypto OHLCV) bundled with the package. """ if source is None: if format not in ["csv", "parquet"]: raise ValueError("format must be 'csv' or 'parquet'") ref = resources.files("elvers.data").joinpath(f"crypto_1d.{format}") with resources.as_file(ref) as p: - return load(p, interval=interval) + return load(p) df = _read_source(source) df = _validate(df) - balanced = _balance(df, interval=interval) + _check_intervals(df) + balanced = _balance(df) return Panel(balanced) @@ -64,6 +61,13 @@ def _validate(df: pl.DataFrame) -> pl.DataFrame: if missing: raise ValueError(f"Missing required columns: {missing}") + ts_dtype = df.schema["timestamp"] + if ts_dtype not in (pl.Date, pl.Datetime, pl.Datetime("ms"), pl.Datetime("us"), pl.Datetime("ns")): + raise TypeError( + f"Column 'timestamp' must be pl.Date or pl.Datetime, got {ts_dtype}. " + f"Cast with pl.col('timestamp').str.to_date() or .str.to_datetime()." 
+ ) + n_total = len(df) n_unique = df.select(pl.struct("timestamp", "symbol").n_unique())[0, 0] if n_unique < n_total: @@ -90,23 +94,36 @@ def _validate(df: pl.DataFrame) -> pl.DataFrame: return df -def _balance(df: pl.DataFrame, interval: str = "1d") -> pl.DataFrame: - """Expand to full date_range x symbols skeleton, fill missing with null.""" - start = df["timestamp"].min() - end = df["timestamp"].max() - symbols = sorted(df["symbol"].unique().to_list()) - ts_type = df.schema["timestamp"] +def _check_intervals(df: pl.DataFrame) -> None: + """Warn if timestamps are not equally spaced.""" + ts = df["timestamp"].unique().sort() + if len(ts) < 3: + return - skeleton = ( - pl.datetime_range(start=start, end=end, interval=interval, eager=True) - .alias("timestamp") - .to_frame() - ) + diffs = ts.diff().drop_nulls() + mode_diff = diffs.mode().sort()[0] + n_irregular = (diffs != mode_diff).sum() + + if n_irregular > 0: + pct = n_irregular / len(diffs) * 100 + warnings.warn( + f"Irregular timestamp intervals detected: {n_irregular} gaps " + f"({pct:.1f}%) differ from the most common interval ({mode_diff}). " + f"This may indicate missing data or mixed frequencies.", + stacklevel=3, + ) - if skeleton.schema["timestamp"] != ts_type: - skeleton = skeleton.with_columns(pl.col("timestamp").cast(ts_type)) - skeleton = skeleton.join(pl.DataFrame({"symbol": symbols}), how="cross") +def _balance(df: pl.DataFrame) -> pl.DataFrame: + """Expand to (all timestamps x all symbols), fill missing with null. + + The skeleton is the union of all timestamps present in the data, + not a generated range. This naturally handles weekends, holidays, + and irregular trading calendars. 
+ """ + timestamps = df.select("timestamp").unique().sort("timestamp") + symbols = pl.DataFrame({"symbol": sorted(df["symbol"].unique().to_list())}) + skeleton = timestamps.join(symbols, how="cross") n_expected = len(skeleton) n_original = len(df) diff --git a/elvers/ops/base.py b/elvers/ops/base.py index 54e8a90..ead4d0f 100644 --- a/elvers/ops/base.py +++ b/elvers/ops/base.py @@ -46,10 +46,7 @@ def multiply(a: Factor, b: Factor | int | float, filter: bool = False) -> Factor def divide(a: Factor, b: Factor | int | float) -> Factor: - """Element-wise division with zero-denominator protection. - - Returns null where abs(b) < 1e-10. - """ + """Element-wise division. Division by zero produces Inf, converted to null by Factor.""" if isinstance(b, Factor): merged = a.df.rename({"factor": "_a"}).join( b.df.select(["timestamp", "symbol", pl.col("factor").alias("_b")]), @@ -57,17 +54,9 @@ def divide(a: Factor, b: Factor | int | float) -> Factor: how="inner", ) result = merged.with_columns( - pl.when(pl.col("_b").abs() < 1e-10) - .then(None) - .otherwise(pl.col("_a") / pl.col("_b")) - .alias("factor") + (pl.col("_a") / pl.col("_b")).alias("factor") ).select(["timestamp", "symbol", "factor"]) return Factor(result, f"({a.name}/{b.name})") - if isinstance(b, (int, float)) and abs(b) < 1e-10: - return Factor( - a.df.with_columns(pl.lit(None).cast(pl.Float64).alias("factor")), - f"({a.name}/{b})", - ) return a / b diff --git a/elvers/ops/cross_sectional.py b/elvers/ops/cross_sectional.py index 2369654..ccaa6ee 100644 --- a/elvers/ops/cross_sectional.py +++ b/elvers/ops/cross_sectional.py @@ -29,7 +29,7 @@ def zscore(f: Factor) -> Factor: col = pl.col("factor") std = col.std(ddof=0) expr = pl.when(col.is_null()).then(None).when( - std < 1e-10 + std == 0 ).then(0.0).otherwise( (col - col.mean()) / std ) @@ -62,8 +62,8 @@ def scale(f: Factor, target: float = 1.0, longscale: float = 0.0, shortscale: fl if longscale > 0 or shortscale > 0: long_sum = col.filter(col > 0).sum() 
short_sum = col.filter(col < 0).abs().sum() - long_factor = pl.when(long_sum < 1e-10).then(0.0).otherwise(pl.lit(longscale) / long_sum) - short_factor = pl.when(short_sum < 1e-10).then(0.0).otherwise(pl.lit(shortscale) / short_sum) + long_factor = pl.when(long_sum == 0).then(0.0).otherwise(pl.lit(longscale) / long_sum) + short_factor = pl.when(short_sum == 0).then(0.0).otherwise(pl.lit(shortscale) / short_sum) expr = pl.when(col.is_null()).then(None).when(col > 0).then( col * long_factor ).when(col < 0).then( @@ -72,7 +72,7 @@ def scale(f: Factor, target: float = 1.0, longscale: float = 0.0, shortscale: fl return _cs_op(f, expr, f"scale({f.name},long={longscale},short={shortscale})") abs_sum = col.abs().sum() expr = pl.when(col.is_null()).then(None).when( - abs_sum < 1e-10 + abs_sum == 0 ).then(0.0).otherwise( col / abs_sum * target ) @@ -85,7 +85,7 @@ def normalize(f: Factor, use_std: bool = False, limit: float = 0.0) -> Factor: demeaned = col - col.mean() if use_std: std = col.std(ddof=0) - result = pl.when(col.is_null()).then(None).when(std < 1e-10).then(0.0).otherwise(demeaned / std) + result = pl.when(col.is_null()).then(None).when(std == 0).then(0.0).otherwise(demeaned / std) else: result = pl.when(col.is_null()).then(None).otherwise(demeaned) if limit > 0: @@ -162,7 +162,7 @@ def signal(f: Factor) -> Factor: abs_sum = demeaned.abs().sum() expr = pl.when( - (valid_count < 2) | (abs_sum < 1e-10) | col.is_null() + (valid_count < 2) | (abs_sum == 0) | col.is_null() ).then(pl.lit(0.0)).otherwise( demeaned / abs_sum ) diff --git a/elvers/ops/math.py b/elvers/ops/math.py index cefdc0b..e6624e4 100644 --- a/elvers/ops/math.py +++ b/elvers/ops/math.py @@ -51,10 +51,9 @@ def signed_power(base: Factor, exp: Factor | int | float) -> Factor: def inverse(f: Factor) -> Factor: - """Reciprocal (1/x). Returns null where abs(x) < 1e-10.""" + """Reciprocal (1/x). 
Division by zero produces Inf, converted to null by Factor.""" col = pl.col("factor") - expr = pl.when(col.abs() < 1e-10).then(None).otherwise(pl.lit(1.0) / col) - return _unary(f, expr, f"inverse({f.name})") + return _unary(f, pl.lit(1.0) / col, f"inverse({f.name})") def s_log_1p(f: Factor) -> Factor: diff --git a/elvers/ops/neutralization.py b/elvers/ops/neutralization.py index fa6bb14..0beb410 100644 --- a/elvers/ops/neutralization.py +++ b/elvers/ops/neutralization.py @@ -17,7 +17,7 @@ def vector_neut(x: Factor, y: Factor) -> Factor: (pl.col("_x") * pl.col("_y")).sum().over("timestamp").alias("_xy"), (pl.col("_y") * pl.col("_y")).sum().over("timestamp").alias("_yy"), ]).with_columns( - pl.when(pl.col("_yy").abs() < 1e-10).then(pl.col("_x")).otherwise( + pl.when(pl.col("_yy").abs() == 0).then(pl.col("_x")).otherwise( pl.col("_x") - (pl.col("_xy") / pl.col("_yy")) * pl.col("_y") ).alias("factor") ).select(["timestamp", "symbol", "factor"]) @@ -39,7 +39,7 @@ def regression_neut(y: Factor, x: Factor) -> Factor: ]).with_columns( (pl.col("_n") * pl.col("_sxx") - pl.col("_sx") * pl.col("_sx")).alias("_denom"), ).with_columns( - pl.when(pl.col("_denom").abs() < 1e-10).then(0.0).otherwise( + pl.when(pl.col("_denom").abs() == 0).then(0.0).otherwise( (pl.col("_n") * pl.col("_sxy") - pl.col("_sx") * pl.col("_sy")) / pl.col("_denom") ).alias("_beta"), ).with_columns( @@ -84,7 +84,7 @@ def group_zscore(f: Factor, group: Factor) -> Factor: col = pl.col("factor") grp_std = col.std(ddof=0).over(["timestamp", "_group"]) result = merged.with_columns( - pl.when(grp_std < 1e-10).then(0.0).otherwise( + pl.when(grp_std == 0).then(0.0).otherwise( (col - col.mean().over(["timestamp", "_group"])) / grp_std ).alias("factor") ).select(["timestamp", "symbol", "factor"]) @@ -102,7 +102,7 @@ def group_scale(f: Factor, group: Factor) -> Factor: mx = col.max().over(["timestamp", "_group"]) rng = mx - mn result = merged.with_columns( - pl.when(rng < 1e-10).then(0.0).otherwise((col - mn) / 
rng).alias("factor") + pl.when(rng == 0).then(0.0).otherwise((col - mn) / rng).alias("factor") ).select(["timestamp", "symbol", "factor"]) return Factor(result, f"group_scale({f.name},{group.name})") @@ -116,7 +116,7 @@ def group_normalize(f: Factor, group: Factor, target: float = 1.0) -> Factor: col = pl.col("factor") grp_abs_sum = col.abs().sum().over(["timestamp", "_group"]) result = merged.with_columns( - pl.when(grp_abs_sum < 1e-10).then(0.0).otherwise(col / grp_abs_sum * target).alias("factor") + pl.when(grp_abs_sum == 0).then(0.0).otherwise(col / grp_abs_sum * target).alias("factor") ).select(["timestamp", "symbol", "factor"]) return Factor(result, f"group_normalize({f.name},{group.name},{target})") @@ -134,7 +134,7 @@ def group_mean(f: Factor, group: Factor, weight: Factor | None = None) -> Factor ) w_sum = pl.col("_w").sum().over(["timestamp", "_group"]) result = merged.with_columns( - pl.when(w_sum.abs() < 1e-10).then(pl.col("factor").mean().over(["timestamp", "_group"])).otherwise( + pl.when(w_sum.abs() == 0).then(pl.col("factor").mean().over(["timestamp", "_group"])).otherwise( (pl.col("factor") * pl.col("_w")).sum().over(["timestamp", "_group"]) / w_sum ).alias("factor") ).select(["timestamp", "symbol", "factor"]) diff --git a/elvers/ops/timeseries.py b/elvers/ops/timeseries.py index 9822ae9..5cbd389 100644 --- a/elvers/ops/timeseries.py +++ b/elvers/ops/timeseries.py @@ -80,7 +80,7 @@ def ts_zscore(f: Factor, window: int) -> Factor: col = pl.col("factor") mean = col.rolling_mean(window, min_samples=window) std = col.rolling_std(window, min_samples=window, ddof=0) - expr = pl.when(std < 1e-10).then(0.0).otherwise((col - mean) / std) + expr = pl.when(std == 0).then(0.0).otherwise((col - mean) / std) return _ts_op(f, expr, f"ts_zscore({f.name},{window})") @@ -162,7 +162,7 @@ def kth_element(f: Factor, window: int, k: int = 1, ignore: str = "NaN") -> Fact val = col.shift(i) is_valid = val.is_not_null() if ignore_zero: - is_valid = is_valid & (val.abs() > 
1e-10) + is_valid = is_valid & (val != 0) new_count = count + is_valid.cast(pl.Int32) accum = pl.when((new_count >= k) & (count < k)).then(val).otherwise(accum) count = new_count @@ -227,7 +227,7 @@ def ts_scale(f: Factor, window: int, constant: float = 0) -> Factor: mn = col.rolling_min(window, min_samples=window) mx = col.rolling_max(window, min_samples=window) rng = mx - mn - expr = pl.when(rng < 1e-10).then(0.0).otherwise((col - mn) / rng) + constant + expr = pl.when(rng == 0).then(0.0).otherwise((col - mn) / rng) + constant return _ts_op(f, expr, f"ts_scale({f.name},{window},{constant})") @@ -264,7 +264,7 @@ def ts_cv(f: Factor, window: int) -> Factor: col = pl.col("factor") std = col.rolling_std(window, min_samples=window, ddof=0) abs_mean = col.rolling_mean(window, min_samples=window).abs() - expr = pl.when(abs_mean < 1e-10).then(None).otherwise(std / abs_mean) + expr = pl.when(abs_mean == 0).then(None).otherwise(std / abs_mean) return _ts_op(f, expr, f"ts_cv({f.name},{window})") @@ -309,7 +309,7 @@ def inst_tvr(f: Factor, window: int) -> Factor: abs_delta = (col - col.shift(1)).abs() numerator = abs_delta.rolling_sum(window - 1, min_samples=window - 1) denominator = col.abs().rolling_sum(window - 1, min_samples=window - 1) - expr = pl.when(denominator < 1e-10).then(0.0).otherwise(numerator / denominator) + expr = pl.when(denominator == 0).then(0.0).otherwise(numerator / denominator) return _ts_op(f, expr, f"inst_tvr({f.name},{window})") @@ -352,7 +352,7 @@ def ts_regression(y: Factor, x: Factor, window: int, lag: int = 0, rettype: int ]) denom = pl.lit(n) * pl.col("_sxx") - pl.col("_sx") * pl.col("_sx") - beta = pl.when(denom.abs() < 1e-10).then(0.0).otherwise( + beta = pl.when(denom == 0).then(0.0).otherwise( (pl.lit(n) * pl.col("_sxy") - pl.col("_sx") * pl.col("_sy")) / denom ) alpha_expr = (pl.col("_sy") - beta * pl.col("_sx")) / pl.lit(n) @@ -386,27 +386,27 @@ def ts_regression(y: Factor, x: Factor, window: int, lag: int = 0, rettype: int result = 
merged.with_columns(pl.col("_sst").alias("factor")) elif rettype == 6: result = merged.with_columns( - pl.when(pl.col("_sst").abs() < 1e-10).then(1.0).otherwise( + pl.when(pl.col("_sst") == 0).then(1.0).otherwise( pl.lit(1.0) - pl.col("_sse") / pl.col("_sst") ).alias("factor") ) elif rettype == 7: result = merged.with_columns( - pl.when(pl.lit(n - 2) < 1e-10).then(None).otherwise( + pl.when(pl.lit(n - 2) == 0).then(None).otherwise( pl.col("_sse") / pl.lit(n - 2) ).alias("factor") ) elif rettype == 8: mse = pl.col("_sse") / pl.lit(n - 2) result = merged.with_columns( - pl.when(pl.col("_denom").abs() < 1e-10).then(None).otherwise( + pl.when(pl.col("_denom") == 0).then(None).otherwise( (mse * pl.lit(n) / pl.col("_denom")).sqrt() ).alias("factor") ) elif rettype == 9: mse = pl.col("_sse") / pl.lit(n - 2) result = merged.with_columns( - pl.when(pl.col("_denom").abs() < 1e-10).then(None).otherwise( + pl.when(pl.col("_denom") == 0).then(None).otherwise( (mse * pl.col("_sxx") / (pl.lit(n) * pl.col("_denom"))).sqrt() ).alias("factor") ) diff --git a/tests/test_cross_sectional.py b/tests/test_cross_sectional.py index adf5207..b906450 100644 --- a/tests/test_cross_sectional.py +++ b/tests/test_cross_sectional.py @@ -74,7 +74,7 @@ def test_demean(self): def test_use_std_gives_unit_variance(self): data = {chr(65+i): float(i*10+5) for i in range(20)} out = normalize(make_factor(data), use_std=True).df["factor"].to_list() - assert sum(out)/len(out) == pytest.approx(0.0, abs=1e-10) + assert sum(out)/len(out) == pytest.approx(0.0) assert sum(v*v for v in out)/len(out) == pytest.approx(1.0, rel=1e-6) def test_limit_clips(self): @@ -101,12 +101,12 @@ def test_null_preserved(self): class TestSignal: def test_net_zero_and_unit_abs_sum(self): vals = signal(make_factor({"A": 1.0, "B": 2.0, "C": 3.0, "D": 4.0})).df["factor"].to_list() - assert sum(vals) == pytest.approx(0.0, abs=1e-10) + assert sum(vals) == pytest.approx(0.0) assert sum(abs(v) for v in vals) == pytest.approx(1.0, 
rel=1e-6) def test_null_gets_zero(self): vals = signal(make_factor({"A": 10.0, "B": None, "C": 30.0})).df.sort("symbol")["factor"].to_list() - assert vals[1] == pytest.approx(0.0, abs=1e-10) + assert vals[1] == pytest.approx(0.0) class TestWinsorize: diff --git a/tests/test_math.py b/tests/test_math.py index d3394bf..4bb60b0 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -13,7 +13,7 @@ class TestLog: def test_natural_log(self): vals = log(make_factor({"A": math.e, "B": 1.0})).df.sort("symbol")["factor"].to_list() assert vals[0] == pytest.approx(1.0, rel=1e-6) - assert vals[1] == pytest.approx(0.0, abs=1e-10) + assert vals[1] == pytest.approx(0.0) def test_base_10(self): assert log(make_factor({"A": 100.0}), base=10.0).df["factor"][0] == pytest.approx(2.0, rel=1e-6) @@ -79,7 +79,7 @@ def test_negative(self): assert s_log_1p(make_factor({"A": -9.0})).df["factor"][0] == pytest.approx(-math.log(10.0), rel=1e-6) def test_zero(self): - assert s_log_1p(make_factor({"A": 0.0})).df["factor"][0] == pytest.approx(0.0, abs=1e-10) + assert s_log_1p(make_factor({"A": 0.0})).df["factor"][0] == pytest.approx(0.0) class TestMaximum: diff --git a/tests/test_neutralization.py b/tests/test_neutralization.py index f361b09..1f0c154 100644 --- a/tests/test_neutralization.py +++ b/tests/test_neutralization.py @@ -59,7 +59,7 @@ def test_mean_zero(self): def test_constant_returns_zero(self): vals = group_zscore(make_factor({"A": 5.0, "B": 5.0}), make_factor({"A": 1.0, "B": 1.0})).df.sort("symbol")["factor"].to_list() - assert vals == pytest.approx([0.0, 0.0], abs=1e-10) + assert vals == pytest.approx([0.0, 0.0]) class TestGroupScale: diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py index 045550d..c743b9c 100644 --- a/tests/test_timeseries.py +++ b/tests/test_timeseries.py @@ -174,7 +174,7 @@ def test_min_max_normalize(self): assert _last(ts_scale(make_ts([2.0, 4.0, 6.0, 8.0, 10.0]), 5))[0] == pytest.approx(1.0) def test_constant_returns_zero(self): - assert 
_last(ts_scale(make_ts([5.0, 5.0, 5.0]), 3))[0] == pytest.approx(0.0, abs=1e-10) + assert _last(ts_scale(make_ts([5.0, 5.0, 5.0]), 3))[0] == pytest.approx(0.0) class TestTsPercentile: @@ -253,7 +253,7 @@ def test_op_md_example(self): assert vals[0] == pytest.approx((1+2+3+3) / (101+99+102+105), rel=1e-4) def test_constant_zero(self): - assert _last(inst_tvr(make_ts([10.0]*5), 5))[0] == pytest.approx(0.0, abs=1e-10) + assert _last(inst_tvr(make_ts([10.0]*5), 5))[0] == pytest.approx(0.0) class TestTsDeltaLimit: @@ -277,20 +277,20 @@ def _ols(self, y, x): sxy = sum(a*b for a, b in zip(x, y)) syy = sum(b*b for b in y) denom = n*sxx - sx*sx - beta = (n*sxy - sx*sy) / denom if abs(denom) > 1e-10 else 0.0 + beta = (n*sxy - sx*sy) / denom if denom != 0 else 0.0 alpha = (sy - beta*sx) / n sse = syy - alpha*sy - beta*sxy sst = syy - sy*sy/n return {"beta": beta, "alpha": alpha, "fitted": alpha + beta*x[-1], "resid": y[-1] - (alpha + beta*x[-1]), "sse": sse, "sst": sst, - "r2": 1 - sse/sst if abs(sst) > 1e-10 else 1.0} + "r2": 1 - sse/sst if sst != 0 else 1.0} def test_all_rettypes(self): y, x = [3.0, 5.0, 4.0, 8.0, 7.0], [1.0, 2.0, 3.0, 4.0, 5.0] e = self._ols(y, x) for rt, key in [(0, "resid"), (1, "alpha"), (2, "beta"), (3, "fitted"), (4, "sse"), (5, "sst"), (6, "r2")]: val = _last(ts_regression(make_ts(y), make_ts(x), 5, rettype=rt))[-1] - assert val == pytest.approx(e[key], rel=1e-6, abs=1e-10), f"rettype={rt}" + assert val == pytest.approx(e[key], rel=1e-6), f"rettype={rt}" def test_sse_sst_r2_identity(self): y_f, x_f = make_ts([3.0, 5.0, 4.0, 8.0, 7.0]), make_ts([1.0, 2.0, 3.0, 4.0, 5.0])