diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..db19f01 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,54 @@ +name: Bug Report +description: Report incorrect behavior or numerical errors +labels: ["bug"] +body: + - type: textarea + id: description + attributes: + label: What happened? + description: Describe the bug. Include expected vs actual behavior. + placeholder: | + ts_mean([1, 2, 3], window=2) returns [None, None, 2.5] + Expected: [None, 1.5, 2.5] + validations: + required: true + + - type: textarea + id: reproduction + attributes: + label: Reproducible Example + render: python + placeholder: | + from elvers import load, ts_mean + + panel = load() + result = ts_mean(panel["close"], 5) + print(result.df) + validations: + required: true + + - type: dropdown + id: numerical + attributes: + label: Does this affect numerical output? + options: + - "Yes -- incorrect numerical values" + - "No -- crash, wrong type, or other non-numerical issue" + validations: + required: true + + - type: textarea + id: environment + attributes: + label: Environment + description: "Paste the output of: python -c \"import elvers; elvers.show_versions()\"" + render: text + placeholder: | + --------Elvers--------- + Elvers: 0.3.0 + Polars: 1.37.0 + Python: 3.12.0 + Platform: Linux-6.1.0-x86_64 + Architecture: x86_64 + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..7879fd0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: Question + url: https://github.com/quantbai/elvers/discussions/new?category=q-a + about: Ask a question or start a discussion. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..3feb4ac --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,26 @@ +name: Feature Request +description: Suggest an improvement or new capability +labels: ["enhancement"] +body: + - type: textarea + id: use_case + attributes: + label: Use Case + description: What problem does this solve? + placeholder: | + Computing returns requires verbose composition every time: + divide(ts_delta(close, 1), ts_delay(close, 1)) + validations: + required: true + + - type: textarea + id: proposed_api + attributes: + label: Proposed API + description: Show how it should work. + render: python + placeholder: | + from elvers import returns + ret = returns(close, window=1) + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/new_operator.yml b/.github/ISSUE_TEMPLATE/new_operator.yml new file mode 100644 index 0000000..06918d6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new_operator.yml @@ -0,0 +1,50 @@ +name: New Operator +description: Propose a new operator +labels: ["new operator"] +body: + - type: input + id: name + attributes: + label: Operator Name + description: "Naming: ts_ for time-series, group_ for group ops, no prefix for cross-sectional." + placeholder: "ts_entropy" + validations: + required: true + + - type: dropdown + id: module + attributes: + label: Module + options: + - "timeseries -- per-symbol rolling window" + - "cross_sectional -- across symbols per timestamp" + - "neutralization -- group operations" + - "math -- element-wise transforms" + - "base -- arithmetic and structural" + validations: + required: true + + - type: textarea + id: definition + attributes: + label: Definition and Use Case + description: Mathematical formula and why this operator is useful. + placeholder: | + H(X) = -sum(p_i * log(p_i)) + Measures randomness in a rolling window. Low entropy often precedes breakouts. 
+ validations: + required: true + + - type: textarea + id: example + attributes: + label: Example + description: Input and expected output. + render: python + placeholder: | + from elvers import load + panel = load() + result = ts_entropy(panel["close"], window=3) + print(result.df.head()) + validations: + required: false diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 56a2c89..0163366 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -5,13 +5,35 @@ - [ ] Bug fix (corrects incorrect behavior) - [ ] New feature (new operator or functionality) - [ ] Refactor (no behavior change) -- [ ] Numerical change (alters factor computation results) [BREAKING] +- [ ] Documentation only +- [ ] Numerical change (alters factor computation results) **[BREAKING]** ## Numerical Impact - + + + ## Testing - [ ] Added or updated tests - [ ] All tests pass (`pytest tests/ -v`) - [ ] Lint passes (`ruff check elvers/`) +- [ ] Type check passes (`pyright elvers/`) + +## New Operator Checklist + +- [ ] Operator added to `ops/__init__.py` imports and `__all__` +- [ ] Docstring includes: description, parameters, return type, null behavior, warmup +- [ ] Divisions handle zero denominators (exact zero check or Inf → null via Factor) +- [ ] No Python loops in computation (Polars expressions only) +- [ ] Uses `ddof=0` for std/variance (population statistics) +- [ ] Uses `min_samples=window` for rolling operations +- [ ] Tests cover: basic correctness, null handling, edge cases +- [ ] `OPERATORS.md` updated with the new operator + +## Reviewer Notes + + + + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cac92d5..841b8ef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,4 +19,5 @@ jobs: python-version: ${{ matrix.python-version }} - run: pip install -e ".[dev]" - run: ruff check elvers/ + - run: pyright elvers/ - run: pytest tests/ -v diff --git a/.gitignore b/.gitignore 
index 476eee0..7ec70e3 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ venv/ *.swo .DS_Store .claude/ +.mypy_cache/ +.pytest_cache/ +.ruff_cache/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c97dbc..2be2469 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,27 @@ All notable changes to this project will be documented in this file. Format follows [Keep a Changelog](https://keepachangelog.com/). Numerical changes are marked with [NUMERICAL]. -## [Unreleased] +## [0.4.0] - 2026-03-26 + +### Added +- `show_versions()` for environment diagnostics in bug reports +- Pyright static type checking in CI pipeline +- Community templates: issue templates (bug, feature, new operator), CODE_OF_CONDUCT, SECURITY +- CONTRIBUTING.md with workflow, numerical invariants, and design rationale +- Timestamp type validation in loader (must be pl.Date or pl.Datetime) +- `_check_intervals()` warns about irregular timestamp spacing +- `Panel.gc()` to drop intermediate columns +- `Panel.select()` to export specific factors + +### Changed +- **Column-based Factor architecture.** Factor stores a column name + Panel reference instead of a full DataFrame. Eliminates all hash joins (24 removed), reduces memory by ~60% per Factor, ~2x faster on large panels. All data lives in Panel._df. +- [NUMERICAL] Removed arbitrary `1e-10` zero guards across all operators. Pure divisions (divide, inverse) now produce Inf → null via Panel._add_col. Statistical and regression operators use exact zero checks for degenerate cases (constant series). +- Replaced interval-based panel skeleton with union-based skeleton (no frequency inference) +- Restructured documentation: CLAUDE.md (AI memo), CONTRIBUTING.md (human guide), no duplication +- Removed `interval` parameter from `load()` +- Restructured into 12-step pipeline: core/, ops/, data/, universe/, analysis/, synthesis/, portfolio/, backtest/, risk/, execution/, monitor/ +- Panel moved from io/ to core/. Loader moved from io/ to data/. 
Sample data moved to data/sample/ +- Column-level memoization: _add_col skips computation if column already exists ## [0.3.0] - 2026-03-24 @@ -16,14 +36,14 @@ Numerical changes are marked with [NUMERICAL]. ### Changed - OPERATORS.md rewritten as pure operator reference manual (signatures, behavior, edge cases) -- Design rationale moved to CLAUDE.md Section 4.1 (developer-facing) +- Design rationale moved to CONTRIBUTING.md (Numerical Invariants section) - Fixed incorrect signatures in docs: `trade_when`, `scale`, `bucket` - Fixed README example code to use only columns present in sample data ## [0.2.0] - 2026-03-23 ### Added -- CLAUDE.md development standards (12 sections covering full workflow) +- CLAUDE.md development memo (architecture map, conduct rules, known limitations) - CI pipeline (GitHub Actions, pytest across Python 3.10-3.13) - Automated release pipeline (tag-triggered PyPI publish + GitHub Release) - Pre-commit hooks (ruff lint + format, pytest) @@ -31,7 +51,7 @@ Numerical changes are marked with [NUMERICAL]. - Dev dependencies (pytest, ruff, pre-commit) - `elvers/ops/_dev.py` for experimental operators - `elvers/ops/_validation.py` input validation helpers -- `load()` now accepts `interval` parameter for sub-daily data (e.g., "1h", "5m") +- `load()` supports any time frequency; panel skeleton built from timestamps present in data - Factor constructor validates required columns [timestamp, symbol, factor] - Tests for `divide()`, `reverse()`, `ts_product` (negative/zero), `ts_regression` (lag>0) @@ -42,7 +62,7 @@ Numerical changes are marked with [NUMERICAL]. 
- [NUMERICAL] `inverse()`: no zero protection; now returns null where abs(x) < 1e-10 - `ts_regression` rettype=7 (MSE): implicit Inf-to-null on window=2; now has explicit guard - `ts_count_nans`: used min_samples=1 unlike all other ts_* operators; aligned to min_samples=window -- `_balance()`: hardcoded daily frequency; now accepts interval parameter +- `_balance()`: replaced generated-range skeleton with union-based skeleton (no frequency inference) ### Changed - DEV operators (hump, ts_arg_max, ts_arg_min) moved from public API to `_dev.py` diff --git a/CLAUDE.md b/CLAUDE.md index acca1e2..5890446 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,334 +1,253 @@ -# Elvers Development Standards +# Elvers -- AI Development Memo -This document is the single source of truth for all development practices. -Any developer or LLM working on this project must follow these standards. +Polars-native quantitative research platform. This memo is for AI assistants working on the codebase. + +For contribution workflow, numerical invariants, and coding standards, see [CONTRIBUTING.md](CONTRIBUTING.md). +For operator specifications, see [OPERATORS.md](OPERATORS.md). --- -## 1. Project Overview +## Conduct -Elvers is a Polars-native multi-factor computation library serving as the core factor -calculation layer for a crypto hedge fund. Production-grade correctness is non-negotiable. +**When uncertain, ask. Do not guess.** Flag assumptions explicitly: "I am assuming X. Is that correct?" -- Language: Python 3.10+ -- Sole external dependency: polars>=1.37.0 -- Architecture: Panel (balanced container) -> Factor (immutable signal vector) -- PyPI: https://pypi.org/project/elvers/ -- Repository: https://github.com/quantbai/elvers +- **Design before implementing.** Think through module placement, API surface, and long-term implications before writing code. +- **Write cold, factual prose.** No emotional language. No marketing. State facts and trade-offs. 
+- **Describe upstream behavior, do not complain.** Say "Polars applies ddof asymmetrically; Elvers isolates this by using ddof=1" -- not "Polars has a bug". +- **`__init__.py` is for imports/exports only.** New functionality goes in dedicated modules. +- **Each module has one responsibility.** Do not let a module exceed its scope. If unsure where code belongs, ask. --- -## 2. Architecture +## Architecture ``` elvers/ - __init__.py Package root. __version__ is the single source of truth for version. - core/factor.py Factor class (immutable container: [timestamp, symbol, factor]) - io/loader.py Data loading and panel balancing (supports daily/hourly/minutely via interval param) - io/panel.py Balanced panel container - ops/base.py Arithmetic operators (add, subtract, multiply, divide, reverse, densify, bucket) - ops/timeseries.py Time-series operators (ts_mean, ts_rank, ts_regression, etc.) - ops/cross_sectional.py Cross-sectional operators (rank, zscore, scale, signal, etc.) - ops/math.py Mathematical operators (log, sqrt, power, inverse, etc.) - ops/neutralization.py Neutralization and group operators - ops/_dev.py Development/experimental operators (not exported, not production) - ops/_validation.py Input validation helpers + __init__.py Imports/exports only. __version__ is the single version source. 
+ _meta.py Diagnostics (show_versions) + + core/ + factor.py Factor (column name + Panel ref, zero data storage) + panel.py Panel (single DataFrame, _add_col + memoization) + + ops/ Step 4: Factor computation (72 operators) + base.py Arithmetic + timeseries.py Time-series (per-symbol rolling window) + cross_sectional.py Cross-sectional (across symbols per timestamp) + math.py Math (element-wise) + neutralization.py Neutralization and group operators + _dev.py Experimental (not exported) + _validation.py Input validation helpers + + data/ Step 2+3: Data acquisition and storage + providers/ Exchange adapters (binance, okx, local) + store.py Parquet on disk (incremental update) + loader.py DataFrame -> Panel (schema validation + balance) + sample/ Built-in sample data + + universe/ Step 1: Instrument selection and filtering + analysis/ Step 5: IC, decay, turnover, coverage, correlation + synthesis/ Step 6: Orthogonalization, combination, selection + portfolio/ Step 7: Optimization, constraints + backtest/ Step 8: Unified signal -> PnL engine + risk/ Step 9: Exposure, limits, VaR + execution/ Step 10+11: Trading + post-trade analysis + monitor/ Step 12: Dashboard, alerts, logging + tests/ - conftest.py Test fixtures (make_ts, make_factor) - test_*.py One test file per operator module -OPERATORS.md Operator specification: numerical conventions, per-operator behavior, design rationale -CLAUDE.md Development standards (this file) + conftest.py Fixtures: make_ts, make_factor, make_panel_ts, make_panel_cs + test_*.py One file per module ``` ---- +## Core Patterns -## 3. 
Git Workflow +- Time-series operators: `expr.over("symbol")` +- Cross-sectional operators: `expr.over("timestamp")` +- All operators: `Factor -> Factor`, stateless, functional +- Binary/multi-factor operators: all factors must share the same Panel (`_check_panel`) +- Factor.name = column name in Panel._df = human-readable expression +- `_add_col` skips computation if column already exists (memoization) -### Branch Strategy +## Data Flow ``` -main (protected) -- production-ready, always passes CI, tagged releases come from here - dev -- integration branch, CI must be green before merge to main - feature/XXX -- short-lived feature branches off dev - fix/XXX -- bug fix branches off dev +data/providers -> data/store -> data/loader -> Panel + | +Panel["close"] -> Factor -> ops -> computed Factor + | + analysis (IC, decay, turnover) -> report + | + synthesis (orthogonalize -> combine) -> alpha Factor + | + backtest (signal -> PnL) -> metrics + | + execution (rebalance -> orders) -> trades ``` -### Daily Development (Step by Step) - -```bash -# 1. Start from latest dev -git checkout dev -git pull origin dev - -# 2. Create feature/fix branch -git checkout -b fix/bug-name # or feature/feature-name - -# 3. Make changes, then verify locally -ruff check elvers/ # lint -pytest tests/ -v # all tests must pass +Backtest accepts any `signal()` output: single factor, multi-factor composite, or portfolio-optimized weights. Same interface, same output format. -# 4. Commit (see Commit Convention below) -git add # never use git add -A -git commit -m "fix(ops): description" +--- -# 5. Push and create PR -git push -u origin fix/bug-name -# -> Create PR on GitHub: fix/bug-name -> dev -# -> Fill in PR template (change type, numerical impact, testing) -# -> CI runs automatically (lint + pytest x4 Python versions) +## Operator Template -# 6. After review approval and CI green: squash merge on GitHub -# 7. 
Delete the feature branch +```python +def ts_example(f: Factor, window: int) -> Factor: + name = f"ts_example({f.name},{window})" + expr = pl.col(f._col).rolling_mean(window, min_samples=window).over("symbol") + f.panel._add_col(expr, name) + return Factor(name, f.panel) ``` -### Branch Rules - -- Never force push to main or dev -- Never commit directly to main (always through PR from dev) -- Feature branches should live no longer than 1-2 days -- Each PR should be a single logical change +After implementing: +1. Add to `ops/__init__.py` imports and `__all__` +2. Add tests in matching `test_*.py` +3. Add entry to `OPERATORS.md` --- -## 4. Coding Standards - -### 4.1 Numerical Correctness (Highest Priority) - -Operator behavior reference: [OPERATORS.md](OPERATORS.md). -The rules below are for writing new code: - -- All divisions MUST have explicit zero guards: - `pl.when(denom.abs() < 1e-10).then(None).otherwise(num / denom)` -- NEVER rely on the Factor constructor's implicit Inf-to-null conversion as normal logic flow -- Null semantics: null propagates naturally through Polars expressions. Boundary cases - (zero denominator, constant window, insufficient data) must be handled explicitly - -#### Design Decisions (rationale for current conventions) +## Known Limitations -- **NaN/Inf unified to null**: eliminates the NaN-infection problem (`NaN + 1 = NaN`) - that silently corrupts downstream computations. The Factor constructor converts on - creation so the entire library operates on a single missing-value type. -- **ddof=0 everywhere**: rolling windows and cross-sections operate on the full observed - population, not a sample from a larger one. ddof=0 is semantically correct and avoids - n=1 division-by-zero (ddof=1 divides by n-1=0). -- **ts_corr/ts_autocorr use ddof=1 internally**: Polars `rolling_corr(ddof=0)` has a bug - where ddof only applies to the covariance numerator, not the variance denominator, - producing values outside [-1, 1]. 
Reported: https://github.com/pola-rs/polars/issues/16161. - Correlation is ddof-invariant (cancels in ratio), so ddof=1 output is correct. -- **rank range (0, 1] not [0, 1]**: a rank of 0 is ambiguous (could mean "missing" or - "lowest"). Strictly positive range ensures every ranked value is distinguishable from null. -- **Zero guard threshold 1e-10**: conservative enough to catch near-zero denominators, - small enough not to interfere with legitimate small values in financial data. -- **ts_product sign-magnitude decomposition**: naive `exp(sum(log(x)))` fails for negative - inputs because `log(x)` is undefined for x < 0. Separating sign and magnitude handles - this correctly. - -### 4.2 Operator Writing Rules - -- Time-series operators: always use `min_samples=window` for consistency -- Every new operator MUST include: functional test + null handling test + edge case test -- Docstrings MUST include: description, parameters, return type, null behavior, warmup period -- Naming: `ts_` prefix for time-series, `group_` prefix for group ops, no prefix for cross-sectional -- New operators must be added to `ops/__init__.py` exports and `__all__` - -### 4.3 Defensive Programming - -- Validate `window >= 1` and integer type at operator entry points -- Validate `0 <= q <= 1` for quantile parameters -- Validate Factor type on inputs (not raw DataFrame) -- Factor constructor validates required columns [timestamp, symbol, factor] -- Use validation helpers from `elvers/ops/_validation.py` - -### 4.4 Immutability - -- Factor objects are immutable after creation -- All operators return new Factor instances; never mutate inputs -- Factor._df and Factor._name are the only instance attributes (__slots__ enforced) - -### 4.5 Style - -- No emoji in code, comments, commit messages, or documentation -- English for all code, comments, and commit messages -- Ruff handles formatting and import sorting; run `ruff check --fix` before committing +- `trade_when`: sentinel value (-1.79e308) 
should become struct-based +- `_validation.py`: not wired into all operator entry points +- `_dev.py`: Python callbacks, not production-grade +- No property-based testing or performance benchmarks +- Column name collisions possible if same operator is called with identical name but different semantics +- Steps 1-3, 5-12 are scaffolded but not yet implemented --- -## 5. Testing Standards - -- Framework: pytest -- Config: pyproject.toml [tool.pytest.ini_options] -- Every public operator must have at least one test -- Numerical precision: use `pytest.approx(expected, abs=1e-10)` -- Test fixtures: `make_ts(vals, symbol)` and `make_factor(values, day)` from conftest.py -- Edge cases to cover: all-null input, single symbol, constant series (zero variance), window > data length -- Test file naming: `test_.py` matches `ops/.py` +## Next Steps (0.4.0 -> 0.5.0) ---- +Current status: Step 4 (Factor Computation) is complete with 72 operators. Column-based architecture, memoization, pyright CI all in place. Tests need rewrite for new architecture. -## 6. Commit Convention +Reference implementation: `ref/phandas_dev/` contains a pandas-based version with data providers, backtesting, analysis, and OKX execution. All modules below should be Polars-native rewrites, not pandas ports. -Format: `(): ` +### Immediate (0.4.0 release blocklist) -Types: -- fix: bug fix (state if it changes numerical output) -- feat: new operator or feature -- refactor: restructuring without behavior change -- test: test additions or modifications only -- perf: performance optimization -- docs: documentation only -- ci: CI/CD configuration -- release: version bump and CHANGELOG update (release commits only) +1. **Rewrite tests for column-based architecture.** Current tests use old Factor(DataFrame) pattern. Every test must use Panel + Factor(column_name, panel). Oracle cross-validation against NumPy/SciPy for complex operators. 
-Rules: -- Each commit is atomic: one logical change per commit -- Each commit must pass all tests (no "break then fix" sequences) -- Numerical changes must document the impact in the commit body with [NUMERICAL] tag -- No WIP, temp, or placeholder commits in main/dev branches +2. **Loader strictness.** `load()` should reject anything not matching the Panel contract. No hand-holding validation, no auto-conversion. Data cleaning is upstream's job. Accept: pl.DataFrame with (timestamp: Date|Datetime, symbol: Utf8, + numeric columns). Reject everything else with a clear error. -Example: -``` -fix(ops): ts_product handle negative inputs +### Phase 1: Analysis (Step 5) -log-sum-exp trick silently returned null for negative factor values. -Replaced with sign-magnitude decomposition: count negatives in window -to determine product sign, use exp(sum(log(abs(x)))) for magnitude. +Module: `elvers/analysis/` -[NUMERICAL] ts_product output changes for any window containing -negative values. Previously returned null, now returns correct product. +``` +analysis/ + __init__.py + ic.py Information Coefficient (Spearman rank-corr vs forward returns) + decay.py IC decay across horizons (1d, 2d, 5d, 10d, 20d) + turnover.py Factor turnover (rank change between periods) + coverage.py Non-null ratio per timestamp + correlation.py Factor-to-factor correlation matrix + report.py Aggregated FactorReport object ``` ---- - -## 7. Code Review Rules - -- When the team has multiple developers, enable "Require approvals" in branch protection -- Currently (single-developer mode): CI status checks are required, review approval is optional -- Reviewer must verify: - 1. Tests pass and cover the change - 2. Numerical correctness (manually verify at least one expected value) - 3. Zero-guard pattern followed for all divisions - 4. 
No implicit Inf-to-null reliance -- PRs that change numerical output must include before/after comparison in PR description -- PRs that add new operators must include the operator in `__all__` exports - ---- - -## 8. Versioning - -Format: MAJOR.MINOR.PATCH (Semantic Versioning) - -| Change Type | Version Bump | Example | -|-------------|-------------|---------| -| Bug fix, no numerical change | PATCH (0.0.x) | Add input validation | -| Bug fix that changes numerical output | MINOR (0.x.0) | Fix ts_product for negatives | -| New operator or feature | MINOR (0.x.0) | Add ts_entropy operator | -| Breaking API change | MAJOR (x.0.0) | Rename Factor to Signal | +Input: Factor + price Factor + horizons list. +Output: FactorReport with .ic(), .decay(), .turnover(), .coverage(), .correlation(). +Reference: `ref/phandas_dev/phandas/phandas/analysis.py` -Version number lives in one place only: `elvers/__init__.py` -> `__version__`. -pyproject.toml reads it dynamically via `[tool.setuptools.dynamic]`. +### Phase 2: Synthesis (Step 6) ---- +Module: `elvers/synthesis/` -## 9. Release Process +``` +synthesis/ + __init__.py + orthogonalize.py Gram-Schmidt or regression residual decorrelation + combine.py Equal-weight, IC-weighted, optimized-weight combination + select.py Factor selection by IC threshold, turnover filter, coverage filter +``` -Releases are triggered by git tags. CI automatically: runs tests, builds package, -publishes to PyPI, and creates a GitHub Release with auto-generated release notes. +Phandas only does arithmetic combination (`(f1 + f2 + f3) / 3`). A top fund adds: +- **Orthogonalization**: remove redundancy between factors before combining +- **IC-weighted**: weight by rolling IC (factors with higher predictive power get more weight) +- **Selection**: drop factors below IC threshold or with excessive turnover -### Step-by-Step Release +### Phase 3: Backtest (Step 8) -```bash -# 1. 
Make sure dev is clean and CI passes -git checkout dev -git pull origin dev -pytest tests/ -v -ruff check elvers/ +Module: `elvers/backtest/` -# 2. Update version number (single source of truth) -# Edit elvers/__init__.py: __version__ = "X.Y.Z" +``` +backtest/ + __init__.py + engine.py Signal -> position weights -> daily PnL + cost.py Transaction cost models (fixed, linear, market impact) + metrics.py Sharpe, Sortino, Calmar, max drawdown, turnover, linearity + report.py BacktestReport with .summary(), .equity(), .drawdowns() +``` -# 3. Update CHANGELOG.md -# Move items from [Unreleased] to [X.Y.Z] - YYYY-MM-DD +Phandas supports dollar-neutral only. Add: +- Long-only mode +- Custom weight functions (not just equal-weight signal) +- Multi-period holding (not just daily rebalance) +- Configurable cost models (fixed rate, linear impact, square-root impact) -# 4. Commit the release -git add elvers/__init__.py CHANGELOG.md -git commit -m "release: vX.Y.Z" -git push origin dev +Reference: `ref/phandas_dev/phandas/phandas/backtest.py` -# 5. Create PR: dev -> main on GitHub -# Title: "release: vX.Y.Z" -# Wait for CI to pass and review approval -# Squash merge on GitHub +### Phase 4: Risk (Step 9) -# 6. Tag on main (after PR merged) -git checkout main -git pull origin main -git tag vX.Y.Z -git push origin vX.Y.Z +Module: `elvers/risk/` -# 7. Automated (triggered by tag push): -# - CI runs full test suite again -# - Builds sdist and wheel (python -m build) -# - Publishes to PyPI via Trusted Publisher -# - Creates GitHub Release with auto-generated notes -# - Attaches built artifacts to the Release +``` +risk/ + __init__.py + exposure.py Factor exposure decomposition (market, sector, style) + limits.py Position limits, concentration limits, turnover limits + var.py Value-at-Risk (historical, parametric) ``` -### What Happens Automatically - -When you push a tag like `vX.Y.Z`: - -1. `.github/workflows/publish.yml` triggers -2. 
Runs full test suite on Python 3.10-3.13 (safety net) -3. If tests pass: builds package, publishes to PyPI -4. Creates a GitHub Release page at github.com/quantbai/elvers/releases - with auto-generated release notes from commit messages -5. Users can now `pip install elvers==X.Y.Z # specific version` - -### What You See on GitHub After Release +### Phase 5: Execution (Step 10+11) -- **Releases** section (right sidebar) shows all published versions -- Each release has: tag name, release notes (from commits), downloadable artifacts -- **Packages** section: ignore this, we use PyPI not GitHub Packages +Module: `elvers/execution/` ---- +``` +execution/ + __init__.py + rebalancer.py Target weights -> order list (exchange-agnostic) + adapters/ + okx.py OKX perpetual swap adapter + binance.py Binance adapter + twap.py TWAP/VWAP execution algorithms + post_trade.py Slippage analysis, execution quality +``` -## 10. Setup +Reference: `ref/phandas_dev/phandas/phandas/trader.py` and `ref/phandas_dev/jinglabs/` -### Infrastructure (already configured) +Note: Execution adapters contain API credentials structure. The adapter interface is public; credential handling and proprietary execution logic are private. 
-- PyPI Trusted Publisher: configured for quantbai/elvers -> publish.yml -- GitHub Branch Protection on main: require PR, require CI status checks -- GitHub Actions: ci.yml (push/PR) + publish.yml (tag-triggered release) +### Phase 6: Data + Universe (Steps 1-3) -### Local Development Setup (every new developer) +Module: `elvers/data/`, `elvers/universe/` -```bash -git clone https://github.com/quantbai/elvers.git -cd elvers -git checkout dev -pip install -e ".[dev]" -pre-commit install +``` +data/ + providers/ + base.py Provider interface + binance.py Binance OHLCV + okx.py OKX perpetual swap data + local.py Local parquet/csv + store.py Incremental parquet storage + loader.py DataFrame -> Panel (strict validation + balance) + sample/ Built-in sample data (crypto_1d.parquet) + +universe/ + __init__.py + filter.py Liquidity, volume, listing date filters + groups.py Sector/category classification ``` ---- +Reference: `ref/phandas_dev/phandas/phandas/providers/` and `ref/phandas_dev/phandas/phandas/data.py` -## 11. Quick Reference +### Phase 7: Monitoring (Step 12) -```bash -pip install -e ".[dev]" # Setup -pre-commit install # Git hooks -pytest tests/ -v # Run all tests -pytest tests/test_timeseries.py::TestTsProduct -v # Single test class -ruff check elvers/ --fix # Lint + auto-fix -ruff format elvers/ # Format code -``` +Module: `elvers/monitor/` ---- +Deferred until execution layer is stable. Will include position monitoring, PnL tracking, and alerting. -## 12. 
Known Limitations / Future Work +### Open Questions (discuss with maintainer) -- `trade_when` uses a sentinel value (-1.79e308) for exit signals; should be replaced with struct-based approach -- Input validation (`_validation.py`) is defined but not yet wired into all operator entry points -- No property-based testing (Hypothesis); would strengthen numerical correctness guarantees -- No performance benchmarks; should establish baseline timings for core operators -- `_dev.py` operators (hump, ts_arg_max, ts_arg_min) use Python callbacks and are not production-grade +1. Execution layer: public or private? The adapter interface is safe to open-source. Proprietary TWAP/execution logic may stay private. +2. Synthesis depth: start with equal-weight + IC-weighted, or include PCA/ML from day one? +3. Backtest scope: dollar-neutral only (crypto standard), or support long-only and long-short? +4. Data providers: which exchanges to support first? Binance + OKX from phandas, or add more? diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..a36e841 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,7 @@ +# Code of Conduct + +This project follows the [Contributor Covenant v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/). + +## Reporting + +Report unacceptable behavior via [GitHub private reporting](https://github.com/quantbai/elvers/security/advisories/new) or by contacting the maintainers through a private GitHub issue. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..04f0498 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,180 @@ +# Contributing to Elvers + +> **Pre-1.0 development.** API is stabilizing but may change between minor versions. +> Operator coverage and numerical conventions are under active refinement. 
+ +--- + +## Getting Started + +```bash +# Fork on GitHub, then: +git clone https://github.com/<your-username>/elvers.git +cd elvers +git remote add upstream https://github.com/quantbai/elvers.git +git checkout dev +pip install -e ".[dev]" +pre-commit install +``` + +Verify: + +```bash +pytest tests/ -v +ruff check elvers/ +pyright elvers/ +``` + +--- + +## Workflow + +### Branches + +``` +main (protected) -- tagged releases only + dev -- integration, CI must pass + feature/XXX -- new operators or features + fix/XXX -- bug fixes +``` + +All changes enter through pull requests to `dev`. + +### Development Cycle + +```bash +git checkout dev && git pull upstream dev # 1. sync +git checkout -b feature/ts-entropy # 2. branch +# ... implement ... +ruff check elvers/ && pyright elvers/ && pytest tests/ -v # 3. verify +git add <files> # 4. stage (never git add -A) +git commit -m "feat(ops): add ts_entropy" # 5. commit +git push -u origin feature/ts-entropy # 6. push +# 7. open PR -> dev +``` + +--- + +## Adding a New Operator + +### 1. Choose the Module + +| Prefix | File | Scope | +|--------|------|-------| +| `ts_` | `ops/timeseries.py` | Per-symbol rolling window | +| (none) | `ops/cross_sectional.py` | Across symbols per timestamp | +| `group_` | `ops/neutralization.py` | Within groups per timestamp | +| (none) | `ops/math.py` | Element-wise math | +| (none) | `ops/base.py` | Arithmetic and structural | + +### 2. Implement + +Adhere to the numerical invariants below and follow existing patterns in the target module. + +### 3. Export + +Add to `elvers/ops/__init__.py`: import line and `__all__` list. + +### 4. Test + +Add tests in `tests/test_<module>.py`: + +- **Correctness**: verify against expected values (`pytest.approx`). For complex operators (regression, covariance, decay), cross-validate against NumPy or SciPy on randomized inputs. +- **Null propagation**: null inputs produce correct null outputs +- **Edge cases**: constant series, all-null, window > data length, zero denominators + +### 5. 
Document + +Add entry to [OPERATORS.md](OPERATORS.md). + +### 6. Pre-PR Checklist + +- [ ] `ruff check elvers/` passes +- [ ] `pyright elvers/` passes +- [ ] `pytest tests/ -v` passes (full suite) +- [ ] Operator in `__init__.py` exports and `__all__` +- [ ] Docstring: description, parameters, return type, null behavior, warmup +- [ ] Divisions handle zero denominators (exact zero check or Inf → null via Factor) +- [ ] No Python loops in computation +- [ ] Tests cover: correctness, null, edge cases +- [ ] `OPERATORS.md` updated + +--- + +## Numerical Invariants + +Elvers maintains the following invariants. + +- NaN and Inf are unified to null. Single missing-value type throughout. +- Pure division (divide, inverse): no guard. Inf → null via Panel._add_col. +- Population statistics: ddof=0 for std, variance, covariance. +- Rank range: (0, 1]. Ties: `average`. Zero excluded. +- Rolling warmup: first `window - 1` values per symbol are null. + +Full conventions: [OPERATORS.md](OPERATORS.md). + +### Design Rationale + +| Decision | Rationale | Trade-off | +|----------|-----------|-----------| +| NaN/Inf -> null | Eliminates NaN infection (`NaN + 1 = NaN`). One missing-value type simplifies null-propagation logic. | Loses the distinction between missing data and computation overflow. Acceptable for daily/hourly factor research where Inf indicates a defect, not a signal. May need revisiting for tick-level microstructure data. | +| ddof=0 | Rolling windows observe the full population within the window, not a sample. Avoids n=1 division-by-zero. | ddof=1 would be appropriate for random samples. Deterministic lookbacks are not random samples. | +| Rank (0, 1] | Zero is ambiguous (missing or lowest?). Strictly positive values are distinguishable from null. | Downstream code cannot assume [0, 1] range. | +| Division by zero | Pure divisions (divide, inverse) have no guard; Inf → null via Factor constructor. Statistical operators (zscore, scale, etc.) 
check for exact zero denominators and return semantic defaults (e.g., 0.0 for constant series). | Near-zero denominators produce large but finite values. Use winsorize/truncate to handle outliers. | +| ts_corr uses ddof=1 internally | Polars `rolling_corr(ddof=0)` applies ddof only to the covariance numerator, not the variance denominator, producing values outside [-1, 1] ([polars#16161](https://github.com/pola-rs/polars/issues/16161)). Elvers isolates this by using ddof=1, which is mathematically equivalent for correlation (ddof cancels in the ratio). | Will align to ddof=0 when the upstream issue is resolved. | + +--- + +## Style + +- Ruff for formatting and linting (line-length = 120) +- Pyright for static type checking (zero errors required in CI) +- English for all code, comments, commits, and documentation +- No emoji + +### Commits + +Format: `<type>(<scope>): <subject>` + +| Type | Use | +|------|-----| +| feat | New operator or feature | +| fix | Bug fix | +| refactor | No behavior change | +| test | Tests only | +| perf | Performance | +| docs | Documentation | +| ci | CI/CD | + +Numerical changes require a `[NUMERICAL]` tag in the commit body with impact description. + +--- + +## Pull Requests + +- One PR = one logical change +- Target: `dev` branch +- CI must pass on Python 3.10, 3.11, 3.12, 3.13 + +Numerical output changes require before/after comparison in the PR description. + +### Review Criteria + +1. Tests pass and cover the change +2. Numerical outputs cross-validated (simple operators: known values; complex operators: NumPy/SciPy oracle) +3. Division-by-zero handled (exact zero check or Inf -> null via Panel) +4. No implicit Inf-to-null reliance +5. Operator exported and documented + +--- + +## Versioning + +MAJOR.MINOR.PATCH ([SemVer](https://semver.org/)). 
+ +| Change | Bump | +|--------|------| +| Bug fix, no numerical change | PATCH | +| Numerical output change | MINOR | +| New operator | MINOR | +| Breaking API change | MAJOR | diff --git a/OPERATORS.md b/OPERATORS.md index 42d7a85..03b84fa 100644 --- a/OPERATORS.md +++ b/OPERATORS.md @@ -10,10 +10,10 @@ | --- | --- | | Missing values | NaN and Inf are converted to null on Factor creation. Single missing-value type (null) throughout. | | Null arithmetic | `5.0 + null = null`. Use `filter=True` on add/subtract/multiply to treat null as identity (0 for +/-, 1 for *). | -| Division by zero | All divisions guarded at `abs(denom) < 1e-10`, returning null. | +| Division by zero | Pure divisions (divide, inverse): no guard, Inf → null via Factor. Statistical/regression operators: exact zero check on denominator, returns semantic default (0.0 for zscore, null for CV, etc.). | | Std / Variance | ddof=0 (population) for all std, variance, covariance, zscore, normalize, winsorize. | | Correlation | ddof-invariant. `ts_covariance(x,y,w) / (ts_std_dev(x,w) * ts_std_dev(y,w)) == ts_corr(x,y,w)`. | -| Rank | Range (0, 1]. Does not include zero. Ties: `average` method. Null excluded. | +| Rank | Range (0, 1]. Does not include zero. Ties: `average` method. Null excluded. Applies to: `rank`, `ts_rank`, `group_rank`, `quantile`. Exception: `densify` uses `dense` (consecutive integers). | | Rolling warmup | All `ts_*` operators: first `window-1` values per symbol are null (`min_samples=window`). | --- @@ -40,7 +40,7 @@ Element-wise multiplication. `filter=True`: null treated as 1. ### `divide(a, b)` -Element-wise division. Returns null where `abs(b) < 1e-10`. +Element-wise division. Division by zero → Inf → null via Factor. ### `reverse(x)` @@ -110,7 +110,7 @@ Rolling excess kurtosis. Fisher definition (normal = 0). ### `ts_zscore(x, window)` -`(x - rolling_mean) / rolling_std`. ddof=0. Returns 0 if `std < 1e-10`. +`(x - rolling_mean) / rolling_std`. ddof=0. Returns 0 if `std = 0`. 
### `ts_corr(x, y, window)` @@ -152,7 +152,7 @@ Periods since last value change. Returns 0 on change. No warmup. ### `ts_scale(x, window)` -Min-max normalization: `(x - min) / (max - min)`. Range: [0, 1]. Returns 0 if range < 1e-10. +Min-max normalization: `(x - min) / (max - min)`. Range: [0, 1]. Returns 0 if range = 0. ### `ts_percentile(x, window, q)` @@ -164,7 +164,7 @@ Rolling percentile at quantile `q` in [0, 1]. ### `ts_cv(x, window)` -Coefficient of variation: `std / abs(mean)`. ddof=0. Returns null if `abs(mean) < 1e-10`. +Coefficient of variation: `std / abs(mean)`. ddof=0. Returns null if mean = 0. ### `ts_autocorr(x, window, lag=1)` @@ -188,7 +188,7 @@ Most recent value in lookback that differs from current. Null if none. ### `inst_tvr(x, window)` -Instrument turnover: `sum(abs(delta)) / sum(abs(x))`. Returns 0 if denom < 1e-10. +Instrument turnover: `sum(abs(delta)) / sum(abs(x))`. Returns 0 if denom = 0. ### `ts_delta_limit(x, y, limit_volume=0.1)` @@ -211,7 +211,7 @@ Rolling OLS regression. | 8 | Std error of beta | | 9 | Std error of alpha | -Zero guard on `sum(x^2) < 1e-10` and `SST < 1e-10`. +Degenerate cases: beta = 0 if `var(x) = 0`, R² = 1 if `SST = 0`. ### `trade_when(trigger, alpha, exit_cond)` @@ -231,7 +231,7 @@ Example: `(4, 3, 6, 10, 2)` -> `(0.6, 0.4, 0.8, 1.0, 0.2)` ### `zscore(x)` -`(x - mean) / std`. ddof=0. Returns 0 if `std < 1e-10`. +`(x - mean) / std`. ddof=0. Returns 0 if `std = 0`. ### `mean(x)` @@ -243,7 +243,7 @@ Cross-sectional median, broadcast to all symbols. ### `scale(x, target=1.0, longscale=0.0, shortscale=0.0)` -Scale so `sum(abs(x)) = target`. When `longscale`/`shortscale` are non-zero, scale long and short legs separately. Returns 0 if sum < 1e-10. +Scale so `sum(abs(x)) = target`. When `longscale`/`shortscale` are non-zero, scale long and short legs separately. Returns 0 if sum = 0. ### `normalize(x, use_std=False, limit=0.0)` @@ -255,7 +255,7 @@ Rank then inverse CDF. Drivers: `gaussian`, `uniform`, `cauchy`. 
Acklam approxim ### `signal(x)` -Zero-mean, unit-absolute-sum normalization. Returns 0 if `abs_sum < 1e-10` or `count < 2`. +Zero-mean, unit-absolute-sum normalization. Returns 0 if `abs_sum = 0` or `count < 2`. ### `winsorize(x, std=4)` @@ -279,11 +279,11 @@ Null values below `minimum`. ### `vector_neut(x, y)` -Orthogonal residual: `x - proj_y(x)`. Returns `x` if `dot(y,y) < 1e-10`. +Orthogonal residual: `x - proj_y(x)`. Returns `x` if `dot(y,y) = 0`. ### `regression_neut(y, x)` -OLS residual: `y - (alpha + beta * x)`. beta = 0 if `var(x) < 1e-10`. +OLS residual: `y - (alpha + beta * x)`. beta = 0 if `var(x) = 0`. ### `group_neutralize(x, group)` @@ -295,19 +295,19 @@ Rank within group. (0, 1], `average`, null excluded. ### `group_zscore(x, group)` -Z-score within group. ddof=0. Returns 0 if group std < 1e-10. +Z-score within group. ddof=0. Returns 0 if group std = 0. ### `group_scale(x, group)` -Min-max within group to [0, 1]. Returns 0 if range < 1e-10. +Min-max within group to [0, 1]. Returns 0 if range = 0. ### `group_normalize(x, group, target=1)` -Scale within group so `sum(abs(x)) = target`. Returns 0 if sum < 1e-10. +Scale within group so `sum(abs(x)) = target`. Returns 0 if sum = 0. ### `group_mean(x, group, weight=None)` -Group mean. With weight: weighted mean, falls back to unweighted if `sum(weight) < 1e-10`. +Group mean. With weight: weighted mean, falls back to unweighted if `sum(weight) = 0`. ### `group_median(x, group)` @@ -343,7 +343,7 @@ Square root. Null for negative inputs. For sign-preserving: `signed_power(x, 0.5 ### `inverse(x)` -`1 / x`. Returns null where `abs(x) < 1e-10`. +`1 / x`. Division by zero → Inf → null via Factor. ### `s_log_1p(x)` diff --git a/README.md b/README.md index 4a89603..8b83a00 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,34 @@ -Polars-native factor computation engine for quantitative research. All operators execute as Rust-backed Polars expressions with no Python loops in the hot path. 
+Polars-native quantitative research platform. All computation executes as Rust-backed Polars expressions with no Python loops in the hot path. -## Core Abstractions +> **Pre-1.0 development.** API is stabilizing but may have breaking changes between minor versions. See [CHANGELOG.md](CHANGELOG.md). -- **`Panel`** -- Balanced `(timestamp, symbol)` container with strict alignment guarantees. Prevents look-ahead bias by construction. -- **`Factor`** -- Immutable signal vector. Every operator takes `Factor` and returns `Factor` with eager evaluation. +## Pipeline + +Elvers covers the full quantitative research workflow: + +| Step | Module | Status | +|------|--------|--------| +| 1. Universe Selection | `universe/` | Planned | +| 2. Data Acquisition | `data/providers/` | Planned | +| 3. Data Storage | `data/store.py` | Planned | +| 4. Factor Computation | `ops/` | **72 operators** | +| 5. Factor Analysis | `analysis/` | Planned | +| 6. Multi-Factor Synthesis | `synthesis/` | Planned | +| 7. Portfolio Construction | `portfolio/` | Planned | +| 8. Backtesting | `backtest/` | Planned | +| 9. Risk Management | `risk/` | Planned | +| 10. Execution | `execution/` | Planned | +| 11. Post-Trade Analysis | `execution/post_trade.py` | Planned | +| 12. Monitoring | `monitor/` | Planned | + +## Design + +- **Column-based.** Factor stores a column name + Panel reference. All data lives in a single Panel DataFrame. Zero intermediate copies, zero hash joins. Same-expression memoization. +- **Balanced alignment.** Panel guarantees every symbol shares the same timestamp index. Missing values are explicit nulls. Look-ahead bias is prevented by construction. +- **Single null semantics.** NaN and Inf are converted to null on column creation. One missing-value type eliminates NaN-infection. 
## Installation @@ -26,28 +48,18 @@ pip install elvers ## Usage ```python -from elvers import load, ts_rank, ts_regression, zscore, signal - -panel = load() # built-in sample data (crypto 1d OHLCV) -close, volume = panel["close"], panel["volume"] +from elvers import load, ts_rank, zscore, signal -momentum = ts_rank(close, 20) -vol_adj = zscore(momentum) / zscore(ts_rank(volume, 20)) -beta_resid = ts_regression(close, volume, window=60, rettype=0) -alpha = signal(vol_adj) -``` - -Sub-daily data is supported via the `interval` parameter: - -```python -panel = load("hourly.parquet", interval="1h") +panel = load() +close = panel["close"] +alpha = signal(zscore(ts_rank(close, 30))) ``` ## Operators 72 operators. All accept and return `Factor`. -**Time-Series** -- rolling window per symbol: +**Time-Series** -- per-symbol rolling window: `ts_delay` `ts_delta` `ts_mean` `ts_sum` `ts_std_dev` `ts_min` `ts_max` `ts_median` `ts_rank` `ts_skewness` `ts_kurtosis` `ts_zscore` `ts_corr` `ts_covariance` `ts_product` `ts_step` `ts_decay_linear` `ts_decay_exp_window` `days_from_last_change` `ts_av_diff` `ts_scale` `ts_percentile` `ts_quantile` `ts_cv` `ts_autocorr` `ts_count_nans` `ts_backfill` `kth_element` `last_diff_value` `inst_tvr` `ts_delta_limit` `ts_regression` `trade_when` @@ -55,11 +67,11 @@ panel = load("hourly.parquet", interval="1h") `rank` `zscore` `mean` `median` `scale` `normalize` `quantile` `signal` `winsorize` `truncate` `left_tail` `right_tail` -**Neutralization and Group** -- sector/industry neutralization: +**Neutralization and Group** -- sector/industry control: `vector_neut` `regression_neut` `group_neutralize` `group_rank` `group_zscore` `group_scale` `group_normalize` `group_mean` `group_median` `group_backfill` -**Math**: +**Math** -- element-wise transforms: `log` `sqrt` `sign` `power` `signed_power` `inverse` `s_log_1p` `maximum` `minimum` `where` @@ -67,7 +79,7 @@ panel = load("hourly.parquet", interval="1h") `add` `subtract` `multiply` `divide` 
`reverse` `densify` `bucket` and standard operators (`+` `-` `*` `/` `**` `abs`) -Full specifications and numerical conventions: **[OPERATORS.md](OPERATORS.md)** +Full specifications: **[OPERATORS.md](OPERATORS.md)** ## Development @@ -75,6 +87,7 @@ Full specifications and numerical conventions: **[OPERATORS.md](OPERATORS.md)** pip install -e ".[dev]" pytest tests/ -v ruff check elvers/ +pyright elvers/ ``` -See [CLAUDE.md](CLAUDE.md) for full development standards. +See [CONTRIBUTING.md](CONTRIBUTING.md) for the contributor guide. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..e9c4dc9 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,21 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.4.x | Yes | +| 0.3.x | No | +| < 0.3 | No | + +## Reporting a Vulnerability + +**Do not open a public issue for security vulnerabilities.** + +Report via [GitHub Private Vulnerability Reporting](https://github.com/quantbai/elvers/security/advisories/new). + +You will receive a response within 72 hours. + +## Scope + +Elvers is a computation library with no network access, no file system writes, and no user authentication. The primary security concern is **numerical correctness** -- if an operator produces silently incorrect results, report it as a high-priority bug via a public issue. 
diff --git a/elvers/__init__.py b/elvers/__init__.py index b2eac87..6af1e05 100644 --- a/elvers/__init__.py +++ b/elvers/__init__.py @@ -4,12 +4,13 @@ Author: quantbai """ -__version__ = "0.3.0" +__version__ = "0.4.0" __author__ = "quantbai" -from .core import Factor -from .io import Panel, load +from ._meta import show_versions +from .core import Factor, Panel +from .data import load from .ops import * # noqa: F401,F403 from .ops import __all__ as _ops_all -__all__ = ["__version__", "__author__", "Factor", "load", "Panel"] + list(_ops_all) +__all__ = ["__version__", "__author__", "Factor", "load", "Panel", "show_versions"] + list(_ops_all) diff --git a/elvers/_meta.py b/elvers/_meta.py new file mode 100644 index 0000000..944bdb4 --- /dev/null +++ b/elvers/_meta.py @@ -0,0 +1,33 @@ +"""Package diagnostics for bug reports.""" + +from __future__ import annotations + +import platform +import sys + +import polars as pl + +from . import __version__ + + +def show_versions() -> str: + """Return version and environment info as a formatted string. + + Also prints to stdout for convenience. + + Returns + ------- + str + Multi-line version info block, suitable for pasting into bug reports. 
+ """ + lines = [ + "--------Elvers---------", + f"Elvers: {__version__}", + f"Polars: {pl.__version__}", + f"Python: {sys.version}", + f"Platform: {platform.platform()}", + f"Architecture: {platform.machine()}", + ] + text = "\n".join(lines) + print(text) + return text diff --git a/elvers/analysis/__init__.py b/elvers/analysis/__init__.py new file mode 100644 index 0000000..d4574e5 --- /dev/null +++ b/elvers/analysis/__init__.py @@ -0,0 +1 @@ +"""Single-factor analysis: IC, decay, turnover, coverage.""" diff --git a/elvers/backtest/__init__.py b/elvers/backtest/__init__.py new file mode 100644 index 0000000..24b6d53 --- /dev/null +++ b/elvers/backtest/__init__.py @@ -0,0 +1 @@ +"""Backtesting engine with unified signal interface.""" diff --git a/elvers/core/__init__.py b/elvers/core/__init__.py index 11df912..2f6a886 100644 --- a/elvers/core/__init__.py +++ b/elvers/core/__init__.py @@ -1,7 +1,6 @@ -"""Core module exports.""" +"""Core objects: Factor and Panel.""" from .factor import Factor +from .panel import Panel -__all__ = [ - "Factor", -] +__all__ = ["Factor", "Panel"] diff --git a/elvers/core/factor.py b/elvers/core/factor.py index 478304a..1591524 100644 --- a/elvers/core/factor.py +++ b/elvers/core/factor.py @@ -1,84 +1,86 @@ -"""Factor class - strictly-typed Polars DataFrame wrapper.""" +"""Factor class - column-based reference into a Panel DataFrame.""" from __future__ import annotations +from typing import TYPE_CHECKING + import polars as pl +if TYPE_CHECKING: + from .panel import Panel + class Factor: - """Eager factor container backed by pl.DataFrame[timestamp, symbol, factor]. + """Column-based factor reference backed by a shared Panel. - Every operator receives Factor(s) and returns a Factor with data - immediately materialized. + Factor stores only a column name and a reference to its parent Panel. + All data lives in Panel._df. Operations add computed columns to the + Panel rather than creating intermediate DataFrames. 
Note: __eq__ and __ne__ return Factor (element-wise comparison), not bool. - This matches pandas semantics and is required for conditional operators - like where(). As a consequence, Factor is not hashable and cannot be - used as dict keys or in sets. + Factor is not hashable and cannot be used as dict keys or in sets. """ - __slots__ = ("_df", "_name") - __hash__ = None - - def __init__(self, df: pl.DataFrame, name: str): - required = {"timestamp", "symbol", "factor"} - missing = required - set(df.columns) - if missing: - raise ValueError(f"Factor DataFrame missing required columns: {missing}. Got: {list(df.columns)}") - sorted_df = df.sort(["symbol", "timestamp"]) - if "factor" in sorted_df.columns and sorted_df.schema["factor"] in (pl.Float32, pl.Float64): - col = pl.col("factor") - sorted_df = sorted_df.with_columns( - pl.when(col.is_nan() | col.is_infinite()).then(None).otherwise(col).alias("factor") - ) - self._df = sorted_df - self._name = name + __slots__ = ("_col", "_panel") + __hash__: None # type: ignore[assignment] - @property - def df(self) -> pl.DataFrame: - return self._df + def __init__(self, col: str, panel: Panel): + self._col = col + self._panel = panel @property def name(self) -> str: - return self._name + return self._col @name.setter - def name(self, value: str): - self._name = value + def name(self, value: str) -> None: + self._col = value + + @property + def panel(self) -> Panel: + return self._panel + + @property + def expr(self) -> pl.Expr: + return pl.col(self._col) + + @property + def df(self) -> pl.DataFrame: + """Reconstruct [timestamp, symbol, factor] for backward compatibility.""" + return self._panel._df.select( + "timestamp", "symbol", pl.col(self._col).alias("factor") + ).sort(["symbol", "timestamp"]) def __str__(self) -> str: - preview_df = self._df.sort(["timestamp", "symbol"]) - return f"Factor: {self._name}\n{preview_df}" + return f"Factor: {self._col}\n{self.df}" def __repr__(self) -> str: - preview_df = 
self._df.sort(["timestamp", "symbol"]) - return f"Factor({self._name}, rows={len(self._df)})\n{preview_df.__repr__()}" - - def _resolve_other(self, other: Factor | int | float): - if isinstance(other, Factor): - left = self._df.rename({"factor": "_l"}) - right = other._df.select(["timestamp", "symbol", pl.col("factor").alias("_r")]) - merged = left.join(right, on=["timestamp", "symbol"], how="inner") - return merged, "_l", "_r", other._name - return self._df, "factor", None, str(other) + n = len(self._panel._df) + return f"Factor({self._col}, rows={n})" + + def _check_panel(self, other: Factor) -> None: + if self._panel is not other._panel: + raise ValueError( + f"Cannot combine factors from different panels: " + f"'{self._col}' and '{other._col}'" + ) - def _binary(self, other, op, op_sym) -> Factor: + def _binary(self, other: Factor | int | float, op, op_sym: str) -> Factor: # type: ignore[type-arg] if isinstance(other, Factor): - merged, lc, rc, oname = self._resolve_other(other) - result = merged.with_columns( - op(pl.col(lc), pl.col(rc)).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"({self._name}{op_sym}{oname})") - result = self._df.with_columns( - op(pl.col("factor"), pl.lit(other)).alias("factor") - ) - return Factor(result, f"({self._name}{op_sym}{other})") - - def _rbinary(self, other, op, op_sym) -> Factor: - result = self._df.with_columns( - op(pl.lit(other), pl.col("factor")).alias("factor") - ) - return Factor(result, f"({other}{op_sym}{self._name})") + self._check_panel(other) + name = f"({self._col}{op_sym}{other._col})" + expr = op(pl.col(self._col), pl.col(other._col)) + else: + name = f"({self._col}{op_sym}{other})" + expr = op(pl.col(self._col), pl.lit(other)) + self._panel._add_col(expr, name) + return Factor(name, self._panel) + + def _rbinary(self, other: int | float, op, op_sym: str) -> Factor: + name = f"({other}{op_sym}{self._col})" + expr = op(pl.lit(other), pl.col(self._col)) + 
self._panel._add_col(expr, name) + return Factor(name, self._panel) def __add__(self, other): return self._binary(other, lambda a, b: a + b, "+") @@ -105,30 +107,26 @@ def __rtruediv__(self, other): return self._rbinary(other, lambda a, b: a / b, "/") def __neg__(self): - return Factor( - self._df.with_columns((-pl.col("factor")).alias("factor")), - f"-{self._name}" - ) + name = f"-{self._col}" + self._panel._add_col(-pl.col(self._col), name) + return Factor(name, self._panel) def __abs__(self): - return Factor( - self._df.with_columns(pl.col("factor").abs().alias("factor")), - f"abs({self._name})" - ) + name = f"abs({self._col})" + self._panel._add_col(pl.col(self._col).abs(), name) + return Factor(name, self._panel) def __pow__(self, exp): if isinstance(exp, Factor): return self._binary(exp, lambda a, b: a.pow(b), "^") - return Factor( - self._df.with_columns(pl.col("factor").pow(exp).alias("factor")), - f"({self._name}^{exp})" - ) + name = f"({self._col}^{exp})" + self._panel._add_col(pl.col(self._col).pow(exp), name) + return Factor(name, self._panel) def __rpow__(self, base): - return Factor( - self._df.with_columns(pl.lit(base).pow(pl.col("factor")).alias("factor")), - f"({base}^{self._name})" - ) + name = f"({base}^{self._col})" + self._panel._add_col(pl.lit(base).pow(pl.col(self._col)), name) + return Factor(name, self._panel) def __gt__(self, other): return self._binary(other, lambda a, b: a > b, ">") @@ -142,8 +140,8 @@ def __lt__(self, other): def __le__(self, other): return self._binary(other, lambda a, b: a <= b, "<=") - def __eq__(self, other): + def __eq__(self, other) -> Factor: # type: ignore[override] return self._binary(other, lambda a, b: a == b, "==") - def __ne__(self, other): + def __ne__(self, other) -> Factor: # type: ignore[override] return self._binary(other, lambda a, b: a != b, "!=") diff --git a/elvers/core/panel.py b/elvers/core/panel.py new file mode 100644 index 0000000..8bfd8ac --- /dev/null +++ b/elvers/core/panel.py @@ -0,0 +1,75 
@@ +"""Panel -- balanced eager container, single source of truth for all data.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +if TYPE_CHECKING: + from .factor import Factor + + +class Panel: + """Balanced panel container backed by a single pl.DataFrame. + + All Factor columns live in this DataFrame. Factor objects hold only + a column name string and a reference back to this Panel. + + Guarantees: sorted by [timestamp, symbol], every symbol has + identical timestamp coverage, missing values filled with null. + """ + + __slots__ = ("_df",) + + def __init__(self, df: pl.DataFrame): + self._df = df.sort(["timestamp", "symbol"]) + + @property + def df(self) -> pl.DataFrame: + return self._df + + def __getitem__(self, name: str) -> Factor: + from .factor import Factor + if name not in self._df.columns: + raise KeyError(f"Column '{name}' not in Panel. Available: {self._df.columns}") + return Factor(name, self) + + def _add_col(self, expr: pl.Expr, name: str) -> None: + """Add a computed column, or skip if already exists (memoization). + + Same name = same expression = same result. Skipping avoids + redundant computation when sub-expressions are reused. + """ + if name in self._df.columns: + return + self._df = self._df.with_columns(expr.alias(name)) + if self._df.schema[name] in (pl.Float32, pl.Float64): + col = pl.col(name) + self._df = self._df.with_columns( + pl.when(col.is_nan() | col.is_infinite()).then(None).otherwise(col).alias(name) + ) + + def select(self, *factors: Factor) -> pl.DataFrame: + """Export index + specified factor columns.""" + cols = ["timestamp", "symbol"] + [f._col for f in factors] + return self._df.select(cols) + + def gc(self, keep: list[str] | None = None) -> None: + """Drop intermediate columns. 
Keep index, original data columns, and specified names.""" + keep_set = {"timestamp", "symbol"} + if keep: + keep_set.update(keep) + drop = [c for c in self._df.columns if c not in keep_set] + if drop: + self._df = self._df.drop(drop) + + @property + def columns(self) -> list[str]: + return self._df.columns + + def __repr__(self) -> str: + n_sym = self._df["symbol"].n_unique() + n_day = self._df["timestamp"].n_unique() + cols = [c for c in self._df.columns if c not in ("timestamp", "symbol")] + return f"Panel({n_sym} symbols x {n_day} days, factors={cols})" diff --git a/elvers/data/__init__.py b/elvers/data/__init__.py index e69de29..84d26d0 100644 --- a/elvers/data/__init__.py +++ b/elvers/data/__init__.py @@ -0,0 +1,5 @@ +"""Data acquisition, storage, and panel loading.""" + +from .loader import load + +__all__ = ["load"] diff --git a/elvers/io/loader.py b/elvers/data/loader.py similarity index 62% rename from elvers/io/loader.py rename to elvers/data/loader.py index c6c91f9..c8252a8 100644 --- a/elvers/io/loader.py +++ b/elvers/data/loader.py @@ -8,13 +8,12 @@ import polars as pl -from .panel import Panel +from ..core.panel import Panel def load( source: str | Path | pl.DataFrame | None = None, format: str = "parquet", - interval: str = "1d", ) -> Panel: """Load data into a balanced Panel. @@ -22,28 +21,26 @@ def load( fully balanced (N symbols x T periods) long-format panel. Missing factor values are filled with null. + The time skeleton is the union of all timestamps present in the data. + Every symbol gets every timestamp; missing values are null. + Parameters ---------- source : str, Path, pl.DataFrame, or None Data source. None loads the built-in sample dataset. format : str File format for built-in data ("csv" or "parquet"). - interval : str - Time interval for the balanced panel skeleton (e.g., "1d", "1h", "5m"). - Must match the frequency of the input data. 
- - When called with no arguments, loads the built-in sample dataset - (daily crypto OHLCV) bundled with the package. """ if source is None: if format not in ["csv", "parquet"]: raise ValueError("format must be 'csv' or 'parquet'") - ref = resources.files("elvers.data").joinpath(f"crypto_1d.{format}") + ref = resources.files("elvers.data.sample").joinpath(f"crypto_1d.{format}") with resources.as_file(ref) as p: - return load(p, interval=interval) + return load(p) df = _read_source(source) df = _validate(df) - balanced = _balance(df, interval=interval) + _check_intervals(df) + balanced = _balance(df) return Panel(balanced) @@ -64,6 +61,13 @@ def _validate(df: pl.DataFrame) -> pl.DataFrame: if missing: raise ValueError(f"Missing required columns: {missing}") + ts_dtype = df.schema["timestamp"] + if ts_dtype not in (pl.Date, pl.Datetime, pl.Datetime("ms"), pl.Datetime("us"), pl.Datetime("ns")): + raise TypeError( + f"Column 'timestamp' must be pl.Date or pl.Datetime, got {ts_dtype}. " + f"Cast with pl.col('timestamp').str.to_date() or .str.to_datetime()." 
+ ) + n_total = len(df) n_unique = df.select(pl.struct("timestamp", "symbol").n_unique())[0, 0] if n_unique < n_total: @@ -90,23 +94,36 @@ def _validate(df: pl.DataFrame) -> pl.DataFrame: return df -def _balance(df: pl.DataFrame, interval: str = "1d") -> pl.DataFrame: - """Expand to full date_range x symbols skeleton, fill missing with null.""" - start = df["timestamp"].min() - end = df["timestamp"].max() - symbols = sorted(df["symbol"].unique().to_list()) - ts_type = df.schema["timestamp"] +def _check_intervals(df: pl.DataFrame) -> None: + """Warn if timestamps are not equally spaced.""" + ts = df["timestamp"].unique().sort() + if len(ts) < 3: + return - skeleton = ( - pl.datetime_range(start=start, end=end, interval=interval, eager=True) - .alias("timestamp") - .to_frame() - ) + diffs = ts.diff().drop_nulls() + mode_diff = diffs.mode().sort()[0] + n_irregular = (diffs != mode_diff).sum() + + if n_irregular > 0: + pct = n_irregular / len(diffs) * 100 + warnings.warn( + f"Irregular timestamp intervals detected: {n_irregular} gaps " + f"({pct:.1f}%) differ from the most common interval ({mode_diff}). " + f"This may indicate missing data or mixed frequencies.", + stacklevel=3, + ) - if skeleton.schema["timestamp"] != ts_type: - skeleton = skeleton.with_columns(pl.col("timestamp").cast(ts_type)) - skeleton = skeleton.join(pl.DataFrame({"symbol": symbols}), how="cross") +def _balance(df: pl.DataFrame) -> pl.DataFrame: + """Expand to (all timestamps x all symbols), fill missing with null. + + The skeleton is the union of all timestamps present in the data, + not a generated range. This naturally handles weekends, holidays, + and irregular trading calendars. 
+ """ + timestamps = df.select("timestamp").unique().sort("timestamp") + symbols = pl.DataFrame({"symbol": sorted(df["symbol"].unique().to_list())}) + skeleton = timestamps.join(symbols, how="cross") n_expected = len(skeleton) n_original = len(df) diff --git a/elvers/data/providers/__init__.py b/elvers/data/providers/__init__.py new file mode 100644 index 0000000..15b580a --- /dev/null +++ b/elvers/data/providers/__init__.py @@ -0,0 +1 @@ +"""Exchange and data source adapters.""" diff --git a/elvers/data/sample/__init__.py b/elvers/data/sample/__init__.py new file mode 100644 index 0000000..fdc4ce4 --- /dev/null +++ b/elvers/data/sample/__init__.py @@ -0,0 +1 @@ +"""Built-in sample datasets.""" diff --git a/elvers/data/crypto_1d.csv b/elvers/data/sample/crypto_1d.csv similarity index 100% rename from elvers/data/crypto_1d.csv rename to elvers/data/sample/crypto_1d.csv diff --git a/elvers/data/crypto_1d.parquet b/elvers/data/sample/crypto_1d.parquet similarity index 100% rename from elvers/data/crypto_1d.parquet rename to elvers/data/sample/crypto_1d.parquet diff --git a/elvers/execution/__init__.py b/elvers/execution/__init__.py new file mode 100644 index 0000000..12b7aeb --- /dev/null +++ b/elvers/execution/__init__.py @@ -0,0 +1 @@ +"""Trade execution, order management, and post-trade analysis.""" diff --git a/elvers/io/__init__.py b/elvers/io/__init__.py deleted file mode 100644 index 4d3b702..0000000 --- a/elvers/io/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .loader import load -from .panel import Panel - -__all__ = ["load", "Panel"] diff --git a/elvers/io/panel.py b/elvers/io/panel.py deleted file mode 100644 index ba8af40..0000000 --- a/elvers/io/panel.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Panel -- balanced eager container.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import polars as pl - -if TYPE_CHECKING: - from ..core.factor import Factor - - -class Panel: - """Balanced panel container backed by pl.DataFrame. 
- - Guarantees: sorted by [timestamp, symbol], every symbol has - identical date coverage, missing values filled with null. - """ - - __slots__ = ("_df",) - - def __init__(self, df: pl.DataFrame): - self._df = df - - @property - def df(self) -> pl.DataFrame: - return self._df - - def __getitem__(self, name: str) -> Factor: - from ..core.factor import Factor - return Factor( - self._df.select("timestamp", "symbol", pl.col(name).alias("factor")), - name, - ) - - def __repr__(self) -> str: - n_sym = self._df["symbol"].n_unique() - n_day = self._df["timestamp"].n_unique() - cols = [c for c in self._df.columns if c not in ("timestamp", "symbol")] - return f"Panel({n_sym} symbols x {n_day} days, factors={cols})" diff --git a/elvers/monitor/__init__.py b/elvers/monitor/__init__.py new file mode 100644 index 0000000..cec1fca --- /dev/null +++ b/elvers/monitor/__init__.py @@ -0,0 +1 @@ +"""Real-time monitoring, alerts, and logging.""" diff --git a/elvers/ops/_dev.py b/elvers/ops/_dev.py index e5ed95f..8fbaad0 100644 --- a/elvers/ops/_dev.py +++ b/elvers/ops/_dev.py @@ -18,7 +18,7 @@ def ts_arg_max(f: Factor, window: int) -> Factor: Uses rolling_map (Python callback); pending native Polars support. """ - expr = pl.col("factor").rolling_map(lambda s: s.arg_max(), window_size=window, min_samples=window) + expr = pl.col(f._col).rolling_map(lambda s: s.arg_max(), window_size=window, min_samples=window) return _ts_op(f, expr, f"ts_arg_max({f.name},{window})") @@ -27,7 +27,7 @@ def ts_arg_min(f: Factor, window: int) -> Factor: Uses rolling_map (Python callback); pending native Polars support. 
""" - expr = pl.col("factor").rolling_map(lambda s: s.arg_min(), window_size=window, min_samples=window) + expr = pl.col(f._col).rolling_map(lambda s: s.arg_min(), window_size=window, min_samples=window) return _ts_op(f, expr, f"ts_arg_min({f.name},{window})") @@ -63,8 +63,11 @@ def _apply(s: pl.Series) -> pl.Series: prev = result[-1] return pl.Series(result) - df = f.df.sort(["symbol", "timestamp"]) - result = df.group_by("symbol", maintain_order=True).map_groups( - lambda group: group.with_columns(_apply(group["factor"]).alias("factor")) + name = f"hump({f.name},{hump_param})" + panel = f.panel + work = panel._df.select(["symbol", "timestamp", pl.col(f._col).alias("_hump_in")]).sort(["symbol", "timestamp"]) + result = work.group_by("symbol", maintain_order=True).map_groups( + lambda group: group.with_columns(_apply(group["_hump_in"]).alias("_hump_out")) ) - return Factor(result, f"hump({f.name},{hump_param})") + panel._df = panel._df.with_columns(result["_hump_out"].alias(name)) + return Factor(name, panel) diff --git a/elvers/ops/base.py b/elvers/ops/base.py index 54e8a90..540117c 100644 --- a/elvers/ops/base.py +++ b/elvers/ops/base.py @@ -14,8 +14,9 @@ def _fill_nulls(f: Factor, fill_value: float) -> Factor: """Replace nulls with fill_value.""" - df = f.df.with_columns(pl.col("factor").fill_null(fill_value)) - return Factor(df, f.name) + name = f"_fill({f.name},{fill_value})" + f.panel._add_col(pl.col(f._col).fill_null(fill_value), name) + return Factor(name, f.panel) def add(a: Factor, b: Factor | int | float, filter: bool = False) -> Factor: @@ -46,28 +47,7 @@ def multiply(a: Factor, b: Factor | int | float, filter: bool = False) -> Factor def divide(a: Factor, b: Factor | int | float) -> Factor: - """Element-wise division with zero-denominator protection. - - Returns null where abs(b) < 1e-10. 
- """ - if isinstance(b, Factor): - merged = a.df.rename({"factor": "_a"}).join( - b.df.select(["timestamp", "symbol", pl.col("factor").alias("_b")]), - on=["timestamp", "symbol"], - how="inner", - ) - result = merged.with_columns( - pl.when(pl.col("_b").abs() < 1e-10) - .then(None) - .otherwise(pl.col("_a") / pl.col("_b")) - .alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"({a.name}/{b.name})") - if isinstance(b, (int, float)) and abs(b) < 1e-10: - return Factor( - a.df.with_columns(pl.lit(None).cast(pl.Float64).alias("factor")), - f"({a.name}/{b})", - ) + """Element-wise division. Division by zero produces Inf, converted to null by Panel.""" return a / b @@ -78,12 +58,13 @@ def reverse(f: Factor) -> Factor: def densify(f: Factor) -> Factor: """Remap values to consecutive integers 0..n-1 per timestamp.""" - col = pl.col("factor") + name = f"densify({f.name})" + col = pl.col(f._col) expr = pl.when(col.is_null()).then(None).otherwise( col.rank(method="dense").over("timestamp").cast(pl.Float64) - 1 ) - df = f.df.with_columns(expr.alias("factor")) - return Factor(df, f"densify({f.name})") + f.panel._add_col(expr, name) + return Factor(name, f.panel) def bucket(f: Factor, buckets: Sequence[float] | None = None, @@ -98,10 +79,11 @@ def bucket(f: Factor, buckets: Sequence[float] | None = None, else: raise ValueError("Provide buckets or range_params") - col = pl.col("factor") + name = f"bucket({f.name},{len(edges)})" + col = pl.col(f._col) expr = pl.lit(0.0) for edge in edges: expr = expr + (col > edge).cast(pl.Float64) result_expr = pl.when(col.is_null()).then(None).otherwise(expr) - df = f.df.with_columns(result_expr.alias("factor")) - return Factor(df, f"bucket({f.name})") + f.panel._add_col(result_expr, name) + return Factor(name, f.panel) diff --git a/elvers/ops/cross_sectional.py b/elvers/ops/cross_sectional.py index 2369654..928fe26 100644 --- a/elvers/ops/cross_sectional.py +++ b/elvers/ops/cross_sectional.py @@ -11,13 
+11,13 @@ def _cs_op(f: Factor, expr: pl.Expr, name: str) -> Factor: """Apply a cross-sectional expression over timestamp groups.""" - df = f.df.with_columns(expr.over("timestamp").alias("factor")) - return Factor(df, name) + f.panel._add_col(expr.over("timestamp"), name) + return Factor(name, f.panel) def rank(f: Factor) -> Factor: """Cross-sectional percentile rank at each timestamp.""" - col = pl.col("factor") + col = pl.col(f._col) rank_expr = pl.when(col.is_null()).then(None).otherwise( col.rank(method="average") / col.count() ) @@ -26,10 +26,10 @@ def rank(f: Factor) -> Factor: def zscore(f: Factor) -> Factor: """Cross-sectional z-score: (x - mean) / std (population).""" - col = pl.col("factor") + col = pl.col(f._col) std = col.std(ddof=0) expr = pl.when(col.is_null()).then(None).when( - std < 1e-10 + std == 0 ).then(0.0).otherwise( (col - col.mean()) / std ) @@ -38,7 +38,7 @@ def zscore(f: Factor) -> Factor: def mean(f: Factor) -> Factor: """Cross-sectional mean (broadcast to all symbols).""" - col = pl.col("factor") + col = pl.col(f._col) return _cs_op( f, pl.when(col.is_null()).then(None).otherwise(col.mean()), @@ -48,7 +48,7 @@ def mean(f: Factor) -> Factor: def median(f: Factor) -> Factor: """Cross-sectional median (broadcast to all symbols).""" - col = pl.col("factor") + col = pl.col(f._col) return _cs_op( f, pl.when(col.is_null()).then(None).otherwise(col.median()), @@ -58,12 +58,12 @@ def median(f: Factor) -> Factor: def scale(f: Factor, target: float = 1.0, longscale: float = 0.0, shortscale: float = 0.0) -> Factor: """Scale abs sum to target, or scale long/short legs separately.""" - col = pl.col("factor") + col = pl.col(f._col) if longscale > 0 or shortscale > 0: long_sum = col.filter(col > 0).sum() short_sum = col.filter(col < 0).abs().sum() - long_factor = pl.when(long_sum < 1e-10).then(0.0).otherwise(pl.lit(longscale) / long_sum) - short_factor = pl.when(short_sum < 1e-10).then(0.0).otherwise(pl.lit(shortscale) / short_sum) + long_factor = 
pl.when(long_sum == 0).then(0.0).otherwise(pl.lit(longscale) / long_sum) + short_factor = pl.when(short_sum == 0).then(0.0).otherwise(pl.lit(shortscale) / short_sum) expr = pl.when(col.is_null()).then(None).when(col > 0).then( col * long_factor ).when(col < 0).then( @@ -72,7 +72,7 @@ def scale(f: Factor, target: float = 1.0, longscale: float = 0.0, shortscale: fl return _cs_op(f, expr, f"scale({f.name},long={longscale},short={shortscale})") abs_sum = col.abs().sum() expr = pl.when(col.is_null()).then(None).when( - abs_sum < 1e-10 + abs_sum == 0 ).then(0.0).otherwise( col / abs_sum * target ) @@ -81,11 +81,11 @@ def scale(f: Factor, target: float = 1.0, longscale: float = 0.0, shortscale: fl def normalize(f: Factor, use_std: bool = False, limit: float = 0.0) -> Factor: """Subtract cross-sectional mean, optionally divide by std and clip.""" - col = pl.col("factor") + col = pl.col(f._col) demeaned = col - col.mean() if use_std: std = col.std(ddof=0) - result = pl.when(col.is_null()).then(None).when(std < 1e-10).then(0.0).otherwise(demeaned / std) + result = pl.when(col.is_null()).then(None).when(std == 0).then(0.0).otherwise(demeaned / std) else: result = pl.when(col.is_null()).then(None).otherwise(demeaned) if limit > 0: @@ -133,7 +133,7 @@ def _rational_ppf(p: pl.Expr) -> pl.Expr: def quantile(f: Factor, driver: str = "gaussian", sigma: float = 1.0) -> Factor: """Transform ranks to target distribution via inverse CDF.""" - col = pl.col("factor") + col = pl.col(f._col) n = col.count() ranked = col.rank(method="average") / n shifted = pl.lit(1.0) / n + ranked * (pl.lit(1.0) - pl.lit(2.0) / n) @@ -156,13 +156,13 @@ def signal(f: Factor) -> Factor: Subtracts cross-sectional mean, then scales so sum(|w|) = 1. Nulls or invalid cross-sections result in 0.0 weights. 
""" - col = pl.col("factor") + col = pl.col(f._col) valid_count = col.is_not_null().sum() demeaned = col - col.mean() abs_sum = demeaned.abs().sum() expr = pl.when( - (valid_count < 2) | (abs_sum < 1e-10) | col.is_null() + (valid_count < 2) | (abs_sum == 0) | col.is_null() ).then(pl.lit(0.0)).otherwise( demeaned / abs_sum ) @@ -171,7 +171,7 @@ def signal(f: Factor) -> Factor: def winsorize(f: Factor, std: float = 4.0) -> Factor: """Clip to [mean - std*sigma, mean + std*sigma] cross-sectionally.""" - col = pl.col("factor") + col = pl.col(f._col) mu = col.mean() sigma = col.std(ddof=0) lower = mu - pl.lit(std) * sigma @@ -183,7 +183,7 @@ def winsorize(f: Factor, std: float = 4.0) -> Factor: def truncate(f: Factor, max_percent: float = 0.01) -> Factor: """Cap so abs(x) <= max_percent * sum(abs(all)) cross-sectionally.""" - col = pl.col("factor") + col = pl.col(f._col) abs_total = col.abs().sum() cap = abs_total * max_percent clamped = pl.max_horizontal(pl.min_horizontal(col, cap), -cap) @@ -193,13 +193,13 @@ def truncate(f: Factor, max_percent: float = 0.01) -> Factor: def left_tail(f: Factor, maximum: float = 0.0) -> Factor: """Null values greater than maximum.""" - col = pl.col("factor") + col = pl.col(f._col) expr = pl.when(col.is_null()).then(None).when(col > maximum).then(None).otherwise(col) return _cs_op(f, expr, f"left_tail({f.name},{maximum})") def right_tail(f: Factor, minimum: float = 0.0) -> Factor: """Null values less than minimum.""" - col = pl.col("factor") + col = pl.col(f._col) expr = pl.when(col.is_null()).then(None).when(col < minimum).then(None).otherwise(col) return _cs_op(f, expr, f"right_tail({f.name},{minimum})") diff --git a/elvers/ops/math.py b/elvers/ops/math.py index cefdc0b..96ce971 100644 --- a/elvers/ops/math.py +++ b/elvers/ops/math.py @@ -8,119 +8,103 @@ def _unary(f: Factor, expr: pl.Expr, name: str) -> Factor: - return Factor(f.df.with_columns(expr.alias("factor")), name) + f.panel._add_col(expr, name) + return Factor(name, f.panel) 
def log(f: Factor, base: float | None = None) -> Factor: """Logarithm with optional base (default: natural log).""" - expr = pl.col("factor").log(base) if base else pl.col("factor").log() + col = pl.col(f._col) + expr = col.log(base) if base else col.log() label = f"log({f.name},{base})" if base else f"log({f.name})" return _unary(f, expr, label) def sqrt(f: Factor) -> Factor: """Square root.""" - return _unary(f, pl.col("factor").sqrt(), f"sqrt({f.name})") + return _unary(f, pl.col(f._col).sqrt(), f"sqrt({f.name})") def sign(f: Factor) -> Factor: """Sign of values: -1, 0, or +1.""" - return _unary(f, pl.col("factor").sign(), f"sign({f.name})") + return _unary(f, pl.col(f._col).sign(), f"sign({f.name})") def power(base: Factor, exp: Factor | int | float) -> Factor: """Element-wise power.""" if isinstance(exp, Factor): return base._binary(exp, lambda a, b: a.pow(b), "^") - return _unary(base, pl.col("factor").pow(exp), f"power({base.name},{exp})") + return _unary(base, pl.col(base._col).pow(exp), f"power({base.name},{exp})") def signed_power(base: Factor, exp: Factor | int | float) -> Factor: """Sign-preserving power: sign(x) * |x|^exp.""" - col = pl.col("factor") + col = pl.col(base._col) if isinstance(exp, Factor): - merged = base.df.rename({"factor": "_b"}).join( - exp.df.select(["timestamp", "symbol", pl.col("factor").alias("_e")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns( - (pl.col("_b").sign() * pl.col("_b").abs().pow(pl.col("_e"))).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"signed_power({base.name},{exp.name})") + base._check_panel(exp) + name = f"signed_power({base.name},{exp.name})" + expr = col.sign() * col.abs().pow(pl.col(exp._col)) + base.panel._add_col(expr, name) + return Factor(name, base.panel) return _unary(base, col.sign() * col.abs().pow(exp), f"signed_power({base.name},{exp})") def inverse(f: Factor) -> Factor: - """Reciprocal (1/x). 
Returns null where abs(x) < 1e-10.""" - col = pl.col("factor") - expr = pl.when(col.abs() < 1e-10).then(None).otherwise(pl.lit(1.0) / col) - return _unary(f, expr, f"inverse({f.name})") + """Reciprocal (1/x). Division by zero produces Inf, converted to null by Panel.""" + return _unary(f, pl.lit(1.0) / pl.col(f._col), f"inverse({f.name})") def s_log_1p(f: Factor) -> Factor: """Sign-preserving log: sign(x) * ln(1 + |x|).""" - col = pl.col("factor") + col = pl.col(f._col) return _unary(f, col.sign() * (1 + col.abs()).log(), f"s_log_1p({f.name})") def maximum(a: Factor, b: Factor | int | float) -> Factor: """Element-wise maximum.""" if isinstance(b, Factor): - merged = a.df.rename({"factor": "_a"}).join( - b.df.select(["timestamp", "symbol", pl.col("factor").alias("_b")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns( - pl.max_horizontal(pl.col("_a"), pl.col("_b")).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"maximum({a.name},{b.name})") - return _unary(a, pl.max_horizontal(pl.col("factor"), pl.lit(b)), f"maximum({a.name},{b})") + a._check_panel(b) + name = f"maximum({a.name},{b.name})" + a.panel._add_col(pl.max_horizontal(pl.col(a._col), pl.col(b._col)), name) + return Factor(name, a.panel) + name = f"maximum({a.name},{b})" + a.panel._add_col(pl.max_horizontal(pl.col(a._col), pl.lit(b)), name) + return Factor(name, a.panel) def minimum(a: Factor, b: Factor | int | float) -> Factor: """Element-wise minimum.""" if isinstance(b, Factor): - merged = a.df.rename({"factor": "_a"}).join( - b.df.select(["timestamp", "symbol", pl.col("factor").alias("_b")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns( - pl.min_horizontal(pl.col("_a"), pl.col("_b")).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"minimum({a.name},{b.name})") - return _unary(a, pl.min_horizontal(pl.col("factor"), pl.lit(b)), f"minimum({a.name},{b})") + 
a._check_panel(b) + name = f"minimum({a.name},{b.name})" + a.panel._add_col(pl.min_horizontal(pl.col(a._col), pl.col(b._col)), name) + return Factor(name, a.panel) + name = f"minimum({a.name},{b})" + a.panel._add_col(pl.min_horizontal(pl.col(a._col), pl.lit(b)), name) + return Factor(name, a.panel) def where(condition: Factor, x: Factor | int | float, y: Factor | int | float) -> Factor: """Conditional selection: x if condition else y.""" - base = condition.df.rename({"factor": "_cond"}) + cond_col = pl.col(condition._col) if isinstance(x, Factor): - base = base.join( - x.df.select(["timestamp", "symbol", pl.col("factor").alias("_x")]), - on=["timestamp", "symbol"], how="inner" - ) - x_col = pl.col("_x") + condition._check_panel(x) + x_col = pl.col(x._col) x_name = x.name else: x_col = pl.lit(x) x_name = str(x) if isinstance(y, Factor): - base = base.join( - y.df.select(["timestamp", "symbol", pl.col("factor").alias("_y")]), - on=["timestamp", "symbol"], how="inner" - ) - y_col = pl.col("_y") + condition._check_panel(y) + y_col = pl.col(y._col) y_name = y.name else: y_col = pl.lit(y) y_name = str(y) - cond = pl.col("_cond") - result = base.with_columns( - pl.when(cond.is_null()).then(None).when( - cond.cast(bool) - ).then(x_col).otherwise(y_col).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"where({condition.name},{x_name},{y_name})") + name = f"where({condition.name},{x_name},{y_name})" + expr = pl.when(cond_col.is_null()).then(None).when(cond_col.cast(bool)).then(x_col).otherwise(y_col) + condition.panel._add_col(expr, name) + return Factor(name, condition.panel) diff --git a/elvers/ops/neutralization.py b/elvers/ops/neutralization.py index fa6bb14..7e3eab5 100644 --- a/elvers/ops/neutralization.py +++ b/elvers/ops/neutralization.py @@ -9,169 +9,151 @@ def vector_neut(x: Factor, y: Factor) -> Factor: """Remove y's influence from x using vector projection.""" - merged = x.df.rename({"factor": "_x"}).join( - 
y.df.select(["timestamp", "symbol", pl.col("factor").alias("_y")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns([ - (pl.col("_x") * pl.col("_y")).sum().over("timestamp").alias("_xy"), - (pl.col("_y") * pl.col("_y")).sum().over("timestamp").alias("_yy"), - ]).with_columns( - pl.when(pl.col("_yy").abs() < 1e-10).then(pl.col("_x")).otherwise( - pl.col("_x") - (pl.col("_xy") / pl.col("_yy")) * pl.col("_y") - ).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"vector_neut({x.name},{y.name})") + x._check_panel(y) + name = f"vector_neut({x.name},{y.name})" + xc, yc = pl.col(x._col), pl.col(y._col) + xy = (xc * yc).sum().over("timestamp") + yy = (yc * yc).sum().over("timestamp") + expr = pl.when(yy.abs() == 0).then(xc).otherwise(xc - (xy / yy) * yc) + x.panel._add_col(expr, name) + return Factor(name, x.panel) def regression_neut(y: Factor, x: Factor) -> Factor: """Remove x's influence from y using linear regression residuals.""" - merged = y.df.rename({"factor": "_y"}).join( - x.df.select(["timestamp", "symbol", pl.col("factor").alias("_x")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns([ - pl.col("_x").count().over("timestamp").alias("_n"), - pl.col("_x").sum().over("timestamp").alias("_sx"), - pl.col("_y").sum().over("timestamp").alias("_sy"), - (pl.col("_x") * pl.col("_y")).sum().over("timestamp").alias("_sxy"), - (pl.col("_x") * pl.col("_x")).sum().over("timestamp").alias("_sxx"), - ]).with_columns( - (pl.col("_n") * pl.col("_sxx") - pl.col("_sx") * pl.col("_sx")).alias("_denom"), - ).with_columns( - pl.when(pl.col("_denom").abs() < 1e-10).then(0.0).otherwise( - (pl.col("_n") * pl.col("_sxy") - pl.col("_sx") * pl.col("_sy")) / pl.col("_denom") - ).alias("_beta"), - ).with_columns( - ((pl.col("_sy") - pl.col("_beta") * pl.col("_sx")) / pl.col("_n")).alias("_alpha"), - ).with_columns( - (pl.col("_y") - pl.col("_alpha") - pl.col("_beta") * pl.col("_x")).alias("factor") 
- ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"regression_neut({y.name},{x.name})") + y._check_panel(x) + panel = y.panel + name = f"regression_neut({y.name},{x.name})" + uid = f"_rn_{id(y)}_{id(x)}" + xc, yc = pl.col(x._col), pl.col(y._col) + + panel._add_col(xc.count().over("timestamp"), f"{uid}_n") + panel._add_col(xc.sum().over("timestamp"), f"{uid}_sx") + panel._add_col(yc.sum().over("timestamp"), f"{uid}_sy") + panel._add_col((xc * yc).sum().over("timestamp"), f"{uid}_sxy") + panel._add_col((xc * xc).sum().over("timestamp"), f"{uid}_sxx") + + n, sx, sy = pl.col(f"{uid}_n"), pl.col(f"{uid}_sx"), pl.col(f"{uid}_sy") + sxy, sxx = pl.col(f"{uid}_sxy"), pl.col(f"{uid}_sxx") + + denom = n * sxx - sx * sx + panel._add_col(denom, f"{uid}_denom") + d = pl.col(f"{uid}_denom") + + beta = pl.when(d.abs() == 0).then(0.0).otherwise((n * sxy - sx * sy) / d) + panel._add_col(beta, f"{uid}_beta") + b = pl.col(f"{uid}_beta") + + alpha = (sy - b * sx) / n + panel._add_col(alpha, f"{uid}_alpha") + a = pl.col(f"{uid}_alpha") + + expr = yc - a - b * xc + panel._add_col(expr, name) + return Factor(name, panel) def group_neutralize(f: Factor, group: Factor) -> Factor: """Subtract group mean at each timestamp.""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns( - (pl.col("factor") - pl.col("factor").mean().over(["timestamp", "_group"])).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_neutralize({f.name},{group.name})") + f._check_panel(group) + name = f"group_neutralize({f.name},{group.name})" + col = pl.col(f._col) + expr = (col - col.mean().over(["timestamp", group._col])) + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_rank(f: Factor, group: Factor) -> Factor: """Percentile rank within each group at each timestamp.""" - merged = f.df.join( - 
group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns( - (pl.col("factor").rank(method="average") / pl.col("factor").count()) - .over(["timestamp", "_group"]).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_rank({f.name},{group.name})") + f._check_panel(group) + name = f"group_rank({f.name},{group.name})" + col = pl.col(f._col) + expr = (col.rank(method="average") / col.count()).over(["timestamp", group._col]) + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_zscore(f: Factor, group: Factor) -> Factor: """Z-score within each group at each timestamp (population std, ddof=0).""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" + f._check_panel(group) + name = f"group_zscore({f.name},{group.name})" + col = pl.col(f._col) + grp_std = col.std(ddof=0).over(["timestamp", group._col]) + expr = pl.when(grp_std == 0).then(0.0).otherwise( + (col - col.mean().over(["timestamp", group._col])) / grp_std ) - col = pl.col("factor") - grp_std = col.std(ddof=0).over(["timestamp", "_group"]) - result = merged.with_columns( - pl.when(grp_std < 1e-10).then(0.0).otherwise( - (col - col.mean().over(["timestamp", "_group"])) / grp_std - ).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_zscore({f.name},{group.name})") + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_scale(f: Factor, group: Factor) -> Factor: """Scale to 0-1 within each group at each timestamp.""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) - col = pl.col("factor") - mn = col.min().over(["timestamp", "_group"]) - mx = col.max().over(["timestamp", "_group"]) + f._check_panel(group) + name = 
f"group_scale({f.name},{group.name})" + col = pl.col(f._col) + mn = col.min().over(["timestamp", group._col]) + mx = col.max().over(["timestamp", group._col]) rng = mx - mn - result = merged.with_columns( - pl.when(rng < 1e-10).then(0.0).otherwise((col - mn) / rng).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_scale({f.name},{group.name})") + expr = pl.when(rng == 0).then(0.0).otherwise((col - mn) / rng) + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_normalize(f: Factor, group: Factor, target: float = 1.0) -> Factor: """Normalize so abs sum within each group equals target.""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) - col = pl.col("factor") - grp_abs_sum = col.abs().sum().over(["timestamp", "_group"]) - result = merged.with_columns( - pl.when(grp_abs_sum < 1e-10).then(0.0).otherwise(col / grp_abs_sum * target).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_normalize({f.name},{group.name},{target})") + f._check_panel(group) + name = f"group_normalize({f.name},{group.name},{target})" + col = pl.col(f._col) + grp_abs_sum = col.abs().sum().over(["timestamp", group._col]) + expr = pl.when(grp_abs_sum == 0).then(0.0).otherwise(col / grp_abs_sum * target) + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_mean(f: Factor, group: Factor, weight: Factor | None = None) -> Factor: """Broadcast group mean to all members (optionally weighted).""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) + f._check_panel(group) + col = pl.col(f._col) + if weight is not None: - merged = merged.join( - weight.df.select(["timestamp", "symbol", pl.col("factor").alias("_w")]), - on=["timestamp", "symbol"], how="inner" + f._check_panel(weight) + wc = 
pl.col(weight._col) + w_sum = wc.sum().over(["timestamp", group._col]) + expr = pl.when(w_sum.abs() == 0).then(col.mean().over(["timestamp", group._col])).otherwise( + (col * wc).sum().over(["timestamp", group._col]) / w_sum ) - w_sum = pl.col("_w").sum().over(["timestamp", "_group"]) - result = merged.with_columns( - pl.when(w_sum.abs() < 1e-10).then(pl.col("factor").mean().over(["timestamp", "_group"])).otherwise( - (pl.col("factor") * pl.col("_w")).sum().over(["timestamp", "_group"]) / w_sum - ).alias("factor") - ).select(["timestamp", "symbol", "factor"]) + name_suffix = f",{weight.name}" else: - result = merged.with_columns( - pl.col("factor").mean().over(["timestamp", "_group"]).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - name_suffix = f",{weight.name}" if weight else "" - return Factor(result, f"group_mean({f.name},{group.name}{name_suffix})") + expr = col.mean().over(["timestamp", group._col]) + name_suffix = "" + + name = f"group_mean({f.name},{group.name}{name_suffix})" + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_median(f: Factor, group: Factor) -> Factor: """Broadcast group median to all members at each timestamp.""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) - result = merged.with_columns( - pl.col("factor").median().over(["timestamp", "_group"]).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_median({f.name},{group.name})") + f._check_panel(group) + name = f"group_median({f.name},{group.name})" + col = pl.col(f._col) + expr = col.median().over(["timestamp", group._col]) + f.panel._add_col(expr, name) + return Factor(name, f.panel) def group_backfill(f: Factor, group: Factor, std: float = 4.0) -> Factor: """Fill null with winsorized mean of same-group instruments at same timestamp.""" - merged = f.df.join( - group.df.select(["timestamp", "symbol", 
pl.col("factor").alias("_group")]), - on=["timestamp", "symbol"], how="inner" - ) - col = pl.col("factor") - grp_mean = col.mean().over(["timestamp", "_group"]) - grp_std = col.std(ddof=0).over(["timestamp", "_group"]) + f._check_panel(group) + name = f"group_backfill({f.name},{group.name},{std})" + col = pl.col(f._col) + grp_key = ["timestamp", group._col] + grp_mean = col.mean().over(grp_key) + grp_std = col.std(ddof=0).over(grp_key) lower = grp_mean - pl.lit(std) * grp_std upper = grp_mean + pl.lit(std) * grp_std winsorized = col.clip(lower, upper) - fill_val = winsorized.mean().over(["timestamp", "_group"]) - result = merged.with_columns( - pl.when(col.is_null()).then(fill_val).otherwise(col).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"group_backfill({f.name},{group.name},{std})") + fill_val = winsorized.mean().over(grp_key) + expr = pl.when(col.is_null()).then(fill_val).otherwise(col) + f.panel._add_col(expr, name) + return Factor(name, f.panel) diff --git a/elvers/ops/timeseries.py b/elvers/ops/timeseries.py index 9822ae9..b66228f 100644 --- a/elvers/ops/timeseries.py +++ b/elvers/ops/timeseries.py @@ -11,103 +11,98 @@ def _ts_op(f: Factor, expr: pl.Expr, name: str) -> Factor: """Apply a time-series expression over symbol groups.""" - df = f.df.with_columns(expr.over("symbol").alias("factor")) - return Factor(df, name) + f.panel._add_col(expr.over("symbol"), name) + return Factor(name, f.panel) def ts_delay(f: Factor, window: int) -> Factor: """Value from N periods ago.""" - return _ts_op(f, pl.col("factor").shift(window), f"ts_delay({f.name},{window})") + return _ts_op(f, pl.col(f._col).shift(window), f"ts_delay({f.name},{window})") def ts_delta(f: Factor, window: int) -> Factor: """Change since N periods ago: current - lagged.""" - return _ts_op(f, pl.col("factor") - pl.col("factor").shift(window), f"ts_delta({f.name},{window})") + col = pl.col(f._col) + return _ts_op(f, col - col.shift(window), 
f"ts_delta({f.name},{window})") def ts_mean(f: Factor, window: int) -> Factor: """Rolling mean over N periods.""" - return _ts_op(f, pl.col("factor").rolling_mean(window, min_samples=window), f"ts_mean({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_mean(window, min_samples=window), f"ts_mean({f.name},{window})") def ts_sum(f: Factor, window: int) -> Factor: """Rolling sum over N periods.""" - return _ts_op(f, pl.col("factor").rolling_sum(window, min_samples=window), f"ts_sum({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_sum(window, min_samples=window), f"ts_sum({f.name},{window})") def ts_std_dev(f: Factor, window: int) -> Factor: """Rolling population standard deviation over N periods.""" - return _ts_op(f, pl.col("factor").rolling_std(window, min_samples=window, ddof=0), f"ts_std_dev({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_std(window, min_samples=window, ddof=0), f"ts_std_dev({f.name},{window})") def ts_min(f: Factor, window: int) -> Factor: """Rolling minimum over N periods.""" - return _ts_op(f, pl.col("factor").rolling_min(window, min_samples=window), f"ts_min({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_min(window, min_samples=window), f"ts_min({f.name},{window})") def ts_max(f: Factor, window: int) -> Factor: """Rolling maximum over N periods.""" - return _ts_op(f, pl.col("factor").rolling_max(window, min_samples=window), f"ts_max({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_max(window, min_samples=window), f"ts_max({f.name},{window})") def ts_median(f: Factor, window: int) -> Factor: """Rolling median over N periods.""" - return _ts_op(f, pl.col("factor").rolling_median(window, min_samples=window), f"ts_median({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_median(window, min_samples=window), f"ts_median({f.name},{window})") def ts_rank(f: Factor, window: int, constant: float = 0) -> Factor: """Percentile rank of current value within rolling window.""" 
- expr = pl.col("factor").rolling_rank(window_size=window, method="average", min_samples=window).cast(pl.Float64) / window + constant + expr = pl.col(f._col).rolling_rank(window_size=window, method="average", min_samples=window).cast(pl.Float64) / window + constant return _ts_op(f, expr, f"ts_rank({f.name},{window})") def ts_skewness(f: Factor, window: int) -> Factor: """Rolling skewness (population) over N periods.""" - return _ts_op( - f, - pl.col("factor").rolling_skew(window, bias=True, min_samples=window), - f"ts_skewness({f.name},{window})", - ) + return _ts_op(f, pl.col(f._col).rolling_skew(window, bias=True, min_samples=window), f"ts_skewness({f.name},{window})") def ts_kurtosis(f: Factor, window: int) -> Factor: """Rolling kurtosis over N periods (excess kurtosis, Fisher definition).""" - return _ts_op(f, pl.col("factor").rolling_kurtosis(window, min_samples=window), f"ts_kurtosis({f.name},{window})") + return _ts_op(f, pl.col(f._col).rolling_kurtosis(window, min_samples=window), f"ts_kurtosis({f.name},{window})") def ts_zscore(f: Factor, window: int) -> Factor: """(x - rolling_mean) / rolling_std (population).""" - col = pl.col("factor") + col = pl.col(f._col) mean = col.rolling_mean(window, min_samples=window) std = col.rolling_std(window, min_samples=window, ddof=0) - expr = pl.when(std < 1e-10).then(0.0).otherwise((col - mean) / std) + expr = pl.when(std == 0).then(0.0).otherwise((col - mean) / std) return _ts_op(f, expr, f"ts_zscore({f.name},{window})") def ts_corr(a: Factor, b: Factor, window: int) -> Factor: """Rolling Pearson correlation between two factors over N periods.""" - merged = a.df.rename({"factor": "_a"}).join( - b.df.select(["timestamp", "symbol", pl.col("factor").alias("_b")]), - on=["timestamp", "symbol"], how="inner" - ).sort(["symbol", "timestamp"]) - result = merged.with_columns( - pl.rolling_corr(pl.col("_a"), pl.col("_b"), window_size=window, min_samples=window, ddof=1) - .over("symbol").alias("factor") - ).select(["timestamp", 
"symbol", "factor"]) - return Factor(result, f"ts_corr({a.name},{b.name},{window})") + a._check_panel(b) + name = f"ts_corr({a.name},{b.name},{window})" + expr = pl.rolling_corr( + pl.col(a._col), pl.col(b._col), + window_size=window, min_samples=window, ddof=1 + ).over("symbol") + a.panel._add_col(expr, name) + return Factor(name, a.panel) def ts_covariance(a: Factor, b: Factor, window: int) -> Factor: """Rolling population covariance between two factors over N periods (ddof=0).""" - merged = a.df.rename({"factor": "_a"}).join( - b.df.select(["timestamp", "symbol", pl.col("factor").alias("_b")]), - on=["timestamp", "symbol"], how="inner" - ).sort(["symbol", "timestamp"]) - result = merged.with_columns( - pl.rolling_cov(pl.col("_a"), pl.col("_b"), window_size=window, min_samples=window, ddof=0) - .over("symbol").alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"ts_covariance({a.name},{b.name},{window})") + a._check_panel(b) + name = f"ts_covariance({a.name},{b.name},{window})" + expr = pl.rolling_cov( + pl.col(a._col), pl.col(b._col), + window_size=window, min_samples=window, ddof=0 + ).over("symbol") + a.panel._add_col(expr, name) + return Factor(name, a.panel) def ts_product(f: Factor, window: int) -> Factor: @@ -115,9 +110,8 @@ def ts_product(f: Factor, window: int) -> Factor: Handles negative values by separating sign and magnitude. If any value in the window is zero, the product is zero. - Returns null for the first (window-1) periods per symbol. 
""" - col = pl.col("factor") + col = pl.col(f._col) has_zero = (col == 0).cast(pl.Int32).rolling_sum(window, min_samples=window) > 0 neg_count = (col < 0).cast(pl.Int32).rolling_sum(window, min_samples=window) sign = pl.when(neg_count % 2 == 1).then(pl.lit(-1.0)).otherwise(pl.lit(1.0)) @@ -137,14 +131,14 @@ def ts_decay_exp_window(f: Factor, window: int, factor: float = 1.0) -> Factor: weights = [factor ** (window - 1 - i) for i in range(window)] return _ts_op( f, - pl.col("factor").rolling_mean(window, weights=weights, min_samples=window), + pl.col(f._col).rolling_mean(window, weights=weights, min_samples=window), f"ts_decay_exp_window({f.name},{window},{factor})", ) def days_from_last_change(f: Factor) -> Factor: """Number of days since value last changed (0 on change day).""" - col = pl.col("factor") + col = pl.col(f._col) changed = (col != col.shift(1)).fill_null(True) counter = col.cum_count() last_change = pl.when(changed).then(counter).otherwise(None).forward_fill() @@ -154,7 +148,7 @@ def days_from_last_change(f: Factor) -> Factor: def kth_element(f: Factor, window: int, k: int = 1, ignore: str = "NaN") -> Factor: """Return k-th valid value looking back d days. 
ignore: 'NaN', 'NaN 0'.""" - col = pl.col("factor") + col = pl.col(f._col) ignore_zero = "0" in ignore accum = pl.lit(None).cast(pl.Float64) count = pl.lit(0) @@ -162,7 +156,7 @@ def kth_element(f: Factor, window: int, k: int = 1, ignore: str = "NaN") -> Fact val = col.shift(i) is_valid = val.is_not_null() if ignore_zero: - is_valid = is_valid & (val.abs() > 1e-10) + is_valid = is_valid & (val != 0) new_count = count + is_valid.cast(pl.Int32) accum = pl.when((new_count >= k) & (count < k)).then(val).otherwise(accum) count = new_count @@ -171,7 +165,7 @@ def kth_element(f: Factor, window: int, k: int = 1, ignore: str = "NaN") -> Fact def last_diff_value(f: Factor, window: int) -> Factor: """Most recent value in past d days that differs from current.""" - col = pl.col("factor") + col = pl.col(f._col) accum = pl.lit(None).cast(pl.Float64) for i in range(1, window + 1): val = col.shift(i) @@ -182,27 +176,31 @@ def last_diff_value(f: Factor, window: int) -> Factor: def trade_when(trigger: Factor, alpha: Factor, exit_cond: Factor) -> Factor: """Hold alpha when trigger fires, null on exit, carry forward otherwise.""" - merged = trigger.df.rename({"factor": "_trigger"}).join( - alpha.df.select(["timestamp", "symbol", pl.col("factor").alias("_alpha")]), - on=["timestamp", "symbol"], how="inner" - ).join( - exit_cond.df.select(["timestamp", "symbol", pl.col("factor").alias("_exit")]), - on=["timestamp", "symbol"], how="inner" - ).sort(["symbol", "timestamp"]) + trigger._check_panel(alpha) + trigger._check_panel(exit_cond) + panel = trigger.panel + name = f"trade_when({trigger.name},{alpha.name},{exit_cond.name})" + _SENTINEL = -1.7976931348623157e+308 - is_exit = pl.col("_exit") > 0 - is_trade = pl.col("_trigger") > 0 + is_exit = pl.col(exit_cond._col) > 0 + is_trade = pl.col(trigger._col) > 0 action_val = pl.when(is_exit).then(pl.lit(_SENTINEL)).when(is_trade).then( - pl.col("_alpha") + pl.col(alpha._col) ).otherwise(None) - result = merged.with_columns( - 
action_val.forward_fill().over("symbol").alias("_filled") - ).with_columns( - pl.when(pl.col("_filled") == _SENTINEL).then(None).when( - pl.col("_filled").is_null() - ).then(None).otherwise(pl.col("_filled")).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"trade_when({trigger.name},{alpha.name},{exit_cond.name})") + + _tmp_action = f"_tw_action_{id(trigger)}" + _tmp_filled = f"_tw_filled_{id(trigger)}" + + panel._add_col(action_val, _tmp_action) + panel._add_col(pl.col(_tmp_action).forward_fill().over("symbol"), _tmp_filled) + + expr = ( + pl.when(pl.col(_tmp_filled) == _SENTINEL).then(None) + .when(pl.col(_tmp_filled).is_null()).then(None) + .otherwise(pl.col(_tmp_filled)) + ) + panel._add_col(expr, name) + return Factor(name, panel) def ts_decay_linear(f: Factor, window: int) -> Factor: @@ -210,24 +208,24 @@ def ts_decay_linear(f: Factor, window: int) -> Factor: weights = [float(i) for i in range(1, window + 1)] return _ts_op( f, - pl.col("factor").rolling_mean(window, weights=weights, min_samples=window), + pl.col(f._col).rolling_mean(window, weights=weights, min_samples=window), f"ts_decay_linear({f.name},{window})", ) def ts_av_diff(f: Factor, window: int) -> Factor: """Distance from rolling average: current - rolling_mean.""" - col = pl.col("factor") + col = pl.col(f._col) return _ts_op(f, col - col.rolling_mean(window, min_samples=window), f"ts_av_diff({f.name},{window})") def ts_scale(f: Factor, window: int, constant: float = 0) -> Factor: """Scale to 0-1 range based on rolling min/max.""" - col = pl.col("factor") + col = pl.col(f._col) mn = col.rolling_min(window, min_samples=window) mx = col.rolling_max(window, min_samples=window) rng = mx - mn - expr = pl.when(rng < 1e-10).then(0.0).otherwise((col - mn) / rng) + constant + expr = pl.when(rng == 0).then(0.0).otherwise((col - mn) / rng) + constant return _ts_op(f, expr, f"ts_scale({f.name},{window},{constant})") @@ -235,14 +233,14 @@ def ts_percentile(f: Factor, 
window: int, q: float) -> Factor: """Rolling quantile value at specified percentile.""" return _ts_op( f, - pl.col("factor").rolling_quantile(quantile=q, window_size=window, min_samples=window), + pl.col(f._col).rolling_quantile(quantile=q, window_size=window, min_samples=window), f"ts_percentile({f.name},{window},{q})", ) def ts_quantile(f: Factor, window: int, driver: str = "gaussian") -> Factor: """Rank within rolling window, shift, then apply inverse CDF transform.""" - col = pl.col("factor") + col = pl.col(f._col) ranked = col.rolling_rank(window_size=window, method="average", min_samples=window).cast(pl.Float64) / window n = pl.lit(float(window)) shifted = pl.lit(1.0) / n + ranked * (pl.lit(1.0) - pl.lit(2.0) / n) @@ -261,33 +259,30 @@ def ts_quantile(f: Factor, window: int, driver: str = "gaussian") -> Factor: def ts_cv(f: Factor, window: int) -> Factor: """Coefficient of variation: std / |mean| over rolling window.""" - col = pl.col("factor") + col = pl.col(f._col) std = col.rolling_std(window, min_samples=window, ddof=0) abs_mean = col.rolling_mean(window, min_samples=window).abs() - expr = pl.when(abs_mean < 1e-10).then(None).otherwise(std / abs_mean) + expr = pl.when(abs_mean == 0).then(None).otherwise(std / abs_mean) return _ts_op(f, expr, f"ts_cv({f.name},{window})") def ts_autocorr(a: Factor, window: int, lag: int = 1) -> Factor: """Rolling autocorrelation with specified lag.""" - col = pl.col("factor") + col = pl.col(a._col) lagged = col.shift(lag) expr = pl.rolling_corr(col, lagged, window_size=window, min_samples=window, ddof=1) return _ts_op(a, expr, f"ts_autocorr({a.name},{window},{lag})") def ts_count_nans(f: Factor, window: int) -> Factor: - """Count null values in rolling window. - - Returns null for the first (window-1) periods per symbol (warmup). 
- """ - expr = pl.col("factor").is_null().cast(pl.Int32).rolling_sum(window, min_samples=window) + """Count null values in rolling window.""" + expr = pl.col(f._col).is_null().cast(pl.Int32).rolling_sum(window, min_samples=window) return _ts_op(f, expr, f"ts_count_nans({f.name},{window})") def ts_backfill(f: Factor, window: int, k: int = 1) -> Factor: """Replace null with k-th most recent non-null within lookback window.""" - col = pl.col("factor") + col = pl.col(f._col) if k == 1: expr = col.forward_fill(limit=window) else: @@ -305,113 +300,102 @@ def ts_backfill(f: Factor, window: int, k: int = 1) -> Factor: def inst_tvr(f: Factor, window: int) -> Factor: """sum(|x[t]-x[t-1]|) / sum(|x[t]|) for d-1 most recent periods.""" - col = pl.col("factor") + col = pl.col(f._col) abs_delta = (col - col.shift(1)).abs() numerator = abs_delta.rolling_sum(window - 1, min_samples=window - 1) denominator = col.abs().rolling_sum(window - 1, min_samples=window - 1) - expr = pl.when(denominator < 1e-10).then(0.0).otherwise(numerator / denominator) + expr = pl.when(denominator == 0).then(0.0).otherwise(numerator / denominator) return _ts_op(f, expr, f"inst_tvr({f.name},{window})") def ts_delta_limit(x: Factor, y: Factor, limit_volume: float = 0.1) -> Factor: - """Clip change in x to limit_volume * |y| per period (simplified, input-based).""" - merged = x.df.rename({"factor": "_x"}).join( - y.df.select(["timestamp", "symbol", pl.col("factor").alias("_y")]), - on=["timestamp", "symbol"], how="inner" - ).sort(["symbol", "timestamp"]) - prev = pl.col("_x").shift(1).over("symbol") - limit_val = pl.col("_y").abs() * limit_volume - delta = pl.col("_x") - prev + """Clip change in x to limit_volume * |y| per period.""" + x._check_panel(y) + name = f"ts_delta_limit({x.name},{y.name},{limit_volume})" + xc = pl.col(x._col) + yc = pl.col(y._col) + prev = xc.shift(1).over("symbol") + limit_val = yc.abs() * limit_volume + delta = xc - prev clamped = delta.clip(-limit_val, limit_val) - result = 
merged.with_columns( - pl.when(prev.is_null()).then(pl.col("_x")).otherwise(prev + clamped).alias("factor") - ).select(["timestamp", "symbol", "factor"]) - return Factor(result, f"ts_delta_limit({x.name},{y.name},{limit_volume})") + expr = pl.when(prev.is_null()).then(xc).otherwise(prev + clamped) + x.panel._add_col(expr.over("symbol"), name) + return Factor(name, x.panel) def ts_regression(y: Factor, x: Factor, window: int, lag: int = 0, rettype: int = 0) -> Factor: """Rolling OLS: y = alpha + beta*x. rettype: 0=resid,1=alpha,2=beta,3=fitted,4=SSE,5=SST,6=R2,7=MSE,8=SE(b),9=SE(a).""" - merged = y.df.rename({"factor": "_y"}).join( - x.df.select(["timestamp", "symbol", pl.col("factor").alias("_x")]), - on=["timestamp", "symbol"], how="inner" - ).sort(["symbol", "timestamp"]) + y._check_panel(x) + panel = y.panel + name = f"ts_regression({y.name},{x.name},{window},{lag},{rettype})" + + uid = f"_reg_{id(y)}_{id(x)}" + xc = pl.col(x._col) + yc = pl.col(y._col) if lag > 0: - merged = merged.with_columns(pl.col("_x").shift(lag).over("symbol")) + lag_name = f"{uid}_xlag" + panel._add_col(xc.shift(lag).over("symbol"), lag_name) + xc = pl.col(lag_name) n = float(window) - xc = pl.col("_x") - yc = pl.col("_y") - - merged = merged.with_columns([ - xc.rolling_sum(window, min_samples=window).over("symbol").alias("_sx"), - yc.rolling_sum(window, min_samples=window).over("symbol").alias("_sy"), - (xc * xc).rolling_sum(window, min_samples=window).over("symbol").alias("_sxx"), - (xc * yc).rolling_sum(window, min_samples=window).over("symbol").alias("_sxy"), - (yc * yc).rolling_sum(window, min_samples=window).over("symbol").alias("_syy"), - ]) - - denom = pl.lit(n) * pl.col("_sxx") - pl.col("_sx") * pl.col("_sx") - beta = pl.when(denom.abs() < 1e-10).then(0.0).otherwise( - (pl.lit(n) * pl.col("_sxy") - pl.col("_sx") * pl.col("_sy")) / denom - ) - alpha_expr = (pl.col("_sy") - beta * pl.col("_sx")) / pl.lit(n) - merged = merged.with_columns([ - denom.alias("_denom"), - 
beta.alias("_beta"), - alpha_expr.alias("_alpha"), - ]) + panel._add_col(xc.rolling_sum(window, min_samples=window).over("symbol"), f"{uid}_sx") + panel._add_col(yc.rolling_sum(window, min_samples=window).over("symbol"), f"{uid}_sy") + panel._add_col((xc * xc).rolling_sum(window, min_samples=window).over("symbol"), f"{uid}_sxx") + panel._add_col((xc * yc).rolling_sum(window, min_samples=window).over("symbol"), f"{uid}_sxy") + panel._add_col((yc * yc).rolling_sum(window, min_samples=window).over("symbol"), f"{uid}_syy") - merged = merged.with_columns( - (pl.col("_alpha") + pl.col("_beta") * xc).alias("_fitted") - ) - merged = merged.with_columns( - (yc - pl.col("_fitted")).alias("_resid") - ) + sx, sy = pl.col(f"{uid}_sx"), pl.col(f"{uid}_sy") + sxx, sxy, syy = pl.col(f"{uid}_sxx"), pl.col(f"{uid}_sxy"), pl.col(f"{uid}_syy") + + denom_expr = pl.lit(n) * sxx - sx * sx + panel._add_col(denom_expr, f"{uid}_denom") + denom = pl.col(f"{uid}_denom") + + beta_expr = pl.when(denom == 0).then(0.0).otherwise((pl.lit(n) * sxy - sx * sy) / denom) + panel._add_col(beta_expr, f"{uid}_beta") + beta = pl.col(f"{uid}_beta") + + alpha_expr = (sy - beta * sx) / pl.lit(n) + panel._add_col(alpha_expr, f"{uid}_alpha") + alpha_col = pl.col(f"{uid}_alpha") + + fitted_expr = alpha_col + beta * xc + panel._add_col(fitted_expr, f"{uid}_fitted") + fitted = pl.col(f"{uid}_fitted") + + resid_expr = yc - fitted + panel._add_col(resid_expr, f"{uid}_resid") if rettype <= 3: - col_map = {0: "_resid", 1: "_alpha", 2: "_beta", 3: "_fitted"} - out_col = col_map.get(rettype, "_resid") - result = merged.with_columns(pl.col(out_col).alias("factor")) + col_map = {0: f"{uid}_resid", 1: f"{uid}_alpha", 2: f"{uid}_beta", 3: f"{uid}_fitted"} + src = col_map.get(rettype, f"{uid}_resid") + panel._add_col(pl.col(src), name) else: - merged = merged.with_columns([ - (pl.col("_syy") - pl.col("_alpha") * pl.col("_sy") - pl.col("_beta") * pl.col("_sxy")).alias("_sse"), - (pl.col("_syy") - pl.col("_sy") * pl.col("_sy") 
/ pl.lit(n)).alias("_sst"), - ]) + sse_expr = syy - alpha_col * sy - beta * sxy + panel._add_col(sse_expr, f"{uid}_sse") + sse = pl.col(f"{uid}_sse") + + sst_expr = syy - sy * sy / pl.lit(n) + panel._add_col(sst_expr, f"{uid}_sst") + sst = pl.col(f"{uid}_sst") if rettype == 4: - result = merged.with_columns(pl.col("_sse").alias("factor")) + panel._add_col(sse, name) elif rettype == 5: - result = merged.with_columns(pl.col("_sst").alias("factor")) + panel._add_col(sst, name) elif rettype == 6: - result = merged.with_columns( - pl.when(pl.col("_sst").abs() < 1e-10).then(1.0).otherwise( - pl.lit(1.0) - pl.col("_sse") / pl.col("_sst") - ).alias("factor") - ) + panel._add_col(pl.when(sst == 0).then(1.0).otherwise(pl.lit(1.0) - sse / sst), name) elif rettype == 7: - result = merged.with_columns( - pl.when(pl.lit(n - 2) < 1e-10).then(None).otherwise( - pl.col("_sse") / pl.lit(n - 2) - ).alias("factor") - ) + panel._add_col(pl.when(pl.lit(n - 2) == 0).then(None).otherwise(sse / pl.lit(n - 2)), name) elif rettype == 8: - mse = pl.col("_sse") / pl.lit(n - 2) - result = merged.with_columns( - pl.when(pl.col("_denom").abs() < 1e-10).then(None).otherwise( - (mse * pl.lit(n) / pl.col("_denom")).sqrt() - ).alias("factor") - ) + mse = sse / pl.lit(n - 2) + panel._add_col(pl.when(denom == 0).then(None).otherwise((mse * pl.lit(n) / denom).sqrt()), name) elif rettype == 9: - mse = pl.col("_sse") / pl.lit(n - 2) - result = merged.with_columns( - pl.when(pl.col("_denom").abs() < 1e-10).then(None).otherwise( - (mse * pl.col("_sxx") / (pl.lit(n) * pl.col("_denom"))).sqrt() - ).alias("factor") - ) + mse = sse / pl.lit(n - 2) + panel._add_col(pl.when(denom == 0).then(None).otherwise((mse * sxx / (pl.lit(n) * denom)).sqrt()), name) else: - result = merged.with_columns(pl.col("_resid").alias("factor")) + panel._add_col(pl.col(f"{uid}_resid"), name) - result = result.select(["timestamp", "symbol", "factor"]) - return Factor(result, f"ts_regression({y.name},{x.name},{window},{lag},{rettype})") 
+ return Factor(name, panel) diff --git a/elvers/portfolio/__init__.py b/elvers/portfolio/__init__.py new file mode 100644 index 0000000..178a254 --- /dev/null +++ b/elvers/portfolio/__init__.py @@ -0,0 +1 @@ +"""Portfolio construction and optimization.""" diff --git a/elvers/risk/__init__.py b/elvers/risk/__init__.py new file mode 100644 index 0000000..b594fe0 --- /dev/null +++ b/elvers/risk/__init__.py @@ -0,0 +1 @@ +"""Risk management: exposure, limits, VaR.""" diff --git a/elvers/synthesis/__init__.py b/elvers/synthesis/__init__.py new file mode 100644 index 0000000..f52e832 --- /dev/null +++ b/elvers/synthesis/__init__.py @@ -0,0 +1 @@ +"""Multi-factor synthesis: orthogonalization, combination, selection.""" diff --git a/elvers/universe/__init__.py b/elvers/universe/__init__.py new file mode 100644 index 0000000..8216b0c --- /dev/null +++ b/elvers/universe/__init__.py @@ -0,0 +1 @@ +"""Universe selection and instrument filtering.""" diff --git a/pyproject.toml b/pyproject.toml index ac386ba..b5fda97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "elvers" dynamic = ["version"] -description = "High-performance multi-factor quantitative framework built on Polars." +description = "Multi-factor alpha research platform for systematic trading." 
readme = "README.md" license = {file = "LICENSE"} authors = [ @@ -35,7 +35,7 @@ include = ["elvers*"] exclude = ["tests*"] [tool.setuptools.package-data] -"elvers.data" = ["*.csv", "*.parquet"] +"elvers.data.sample" = ["*.csv", "*.parquet"] [tool.setuptools.dynamic] version = {attr = "elvers.__version__"} @@ -44,6 +44,7 @@ version = {attr = "elvers.__version__"} dev = [ "pytest>=7.0", "ruff>=0.4.0", + "pyright>=1.1", "pre-commit>=3.0", ] diff --git a/tests/conftest.py b/tests/conftest.py index f19cd11..d6c99d7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,20 +1,60 @@ from __future__ import annotations + from datetime import date + import polars as pl -from elvers import Factor + +from elvers import Factor, Panel -def make_ts(vals, symbol="X"): +def make_ts(vals, symbol="X", col_name="factor"): rows = [ - {"timestamp": date(2024, 1, d + 1), "symbol": symbol, "factor": v} + {"timestamp": date(2024, 1, d + 1), "symbol": symbol, col_name: v} for d, v in enumerate(vals) ] - return Factor(pl.DataFrame(rows), "test") + panel = Panel(pl.DataFrame(rows)) + return Factor(col_name, panel) -def make_factor(values, day=1): +def make_factor(values, day=1, col_name="factor"): rows = [ - {"timestamp": date(2024, 1, day), "symbol": s, "factor": v} + {"timestamp": date(2024, 1, day), "symbol": s, col_name: v} for s, v in values.items() ] - return Factor(pl.DataFrame(rows), "test") + panel = Panel(pl.DataFrame(rows)) + return Factor(col_name, panel) + + +def make_panel_ts(cols: dict[str, list], symbol="X"): + """Create a Panel with multiple columns sharing the same index. 
+ + Usage: make_panel_ts({"a": [1,2,3], "b": [4,5,6]}) + Returns: (Panel, dict of name->Factor) + """ + n = len(next(iter(cols.values()))) + rows = { + "timestamp": [date(2024, 1, d + 1) for d in range(n)], + "symbol": [symbol] * n, + } + rows.update(cols) + panel = Panel(pl.DataFrame(rows)) + factors = {name: Factor(name, panel) for name in cols} + return panel, factors + + +def make_panel_cs(cols: dict[str, dict[str, float | None]], day=1): + """Create a Panel with multiple columns for cross-sectional tests. + + Usage: make_panel_cs({"f": {"A": 1.0, "B": 2.0}, "g": {"A": 3.0, "B": 4.0}}) + Returns: (Panel, dict of name->Factor) + """ + symbols = list(next(iter(cols.values())).keys()) + rows = { + "timestamp": [date(2024, 1, day)] * len(symbols), + "symbol": symbols, + } + for col_name, values in cols.items(): + rows[col_name] = [values[s] for s in symbols] + panel = Panel(pl.DataFrame(rows)) + factors = {name: Factor(name, panel) for name in cols} + return panel, factors diff --git a/tests/test_base.py b/tests/test_base.py index a4ae314..7fc1716 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,53 +1,46 @@ from __future__ import annotations -from datetime import date -import polars as pl + import pytest -from elvers import Factor, add, subtract, multiply, divide, reverse, densify, bucket -from tests.conftest import make_factor +from elvers import add, subtract, multiply, divide, reverse, densify, bucket +from tests.conftest import make_factor, make_panel_cs class TestAdd: def test_null_propagates(self): - a = make_factor({"X": 1.0, "Y": None}) - b = make_factor({"X": 4.0, "Y": 5.0}) - vals = add(a, b).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"X": 1.0, "Y": None}, "b": {"X": 4.0, "Y": 5.0}}) + vals = add(fs["a"], fs["b"]).df.sort("symbol")["factor"].to_list() assert vals[0] == pytest.approx(5.0) assert vals[1] is None def test_filter_treats_null_as_zero(self): - a = make_factor({"X": 1.0, "Y": None, "Z": 3.0}) - b = 
make_factor({"X": 4.0, "Y": 5.0, "Z": None}) - vals = add(a, b, filter=True).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"X": 1.0, "Y": None, "Z": 3.0}, "b": {"X": 4.0, "Y": 5.0, "Z": None}}) + vals = add(fs["a"], fs["b"], filter=True).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([5.0, 5.0, 3.0]) class TestMultiply: def test_filter_treats_null_as_one(self): - a = make_factor({"X": 5.0, "Y": None}) - b = make_factor({"X": 3.0, "Y": 2.0}) - vals = multiply(a, b, filter=True).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"X": 5.0, "Y": None}, "b": {"X": 3.0, "Y": 2.0}}) + vals = multiply(fs["a"], fs["b"], filter=True).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([15.0, 2.0]) class TestSubtract: def test_filter_treats_null_as_zero(self): - a = make_factor({"X": None, "Y": 5.0}) - b = make_factor({"X": 3.0, "Y": None}) - vals = subtract(a, b, filter=True).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"X": None, "Y": 5.0}, "b": {"X": 3.0, "Y": None}}) + vals = subtract(fs["a"], fs["b"], filter=True).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([-3.0, 5.0]) class TestDivide: def test_basic(self): - a = make_factor({"X": 10.0, "Y": 6.0}) - b = make_factor({"X": 2.0, "Y": 3.0}) - vals = divide(a, b).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"X": 10.0, "Y": 6.0}, "b": {"X": 2.0, "Y": 3.0}}) + vals = divide(fs["a"], fs["b"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([5.0, 2.0]) def test_zero_denominator_returns_null(self): - a = make_factor({"X": 10.0, "Y": 5.0}) - b = make_factor({"X": 0.0, "Y": 2.0}) - vals = divide(a, b).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"X": 10.0, "Y": 5.0}, "b": {"X": 0.0, "Y": 2.0}}) + vals = divide(fs["a"], fs["b"]).df.sort("symbol")["factor"].to_list() assert vals[0] is None assert vals[1] == pytest.approx(2.5) diff --git 
a/tests/test_cross_sectional.py b/tests/test_cross_sectional.py index adf5207..b906450 100644 --- a/tests/test_cross_sectional.py +++ b/tests/test_cross_sectional.py @@ -74,7 +74,7 @@ def test_demean(self): def test_use_std_gives_unit_variance(self): data = {chr(65+i): float(i*10+5) for i in range(20)} out = normalize(make_factor(data), use_std=True).df["factor"].to_list() - assert sum(out)/len(out) == pytest.approx(0.0, abs=1e-10) + assert sum(out)/len(out) == pytest.approx(0.0) assert sum(v*v for v in out)/len(out) == pytest.approx(1.0, rel=1e-6) def test_limit_clips(self): @@ -101,12 +101,12 @@ def test_null_preserved(self): class TestSignal: def test_net_zero_and_unit_abs_sum(self): vals = signal(make_factor({"A": 1.0, "B": 2.0, "C": 3.0, "D": 4.0})).df["factor"].to_list() - assert sum(vals) == pytest.approx(0.0, abs=1e-10) + assert sum(vals) == pytest.approx(0.0) assert sum(abs(v) for v in vals) == pytest.approx(1.0, rel=1e-6) def test_null_gets_zero(self): vals = signal(make_factor({"A": 10.0, "B": None, "C": 30.0})).df.sort("symbol")["factor"].to_list() - assert vals[1] == pytest.approx(0.0, abs=1e-10) + assert vals[1] == pytest.approx(0.0) class TestWinsorize: diff --git a/tests/test_math.py b/tests/test_math.py index d3394bf..0c8fc90 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -1,19 +1,19 @@ from __future__ import annotations + import math -import polars as pl import pytest from elvers import ( - Factor, log, sqrt, sign, power, signed_power, + log, sqrt, sign, power, signed_power, inverse, s_log_1p, maximum, minimum, where, ) -from tests.conftest import make_factor +from tests.conftest import make_factor, make_panel_cs class TestLog: def test_natural_log(self): vals = log(make_factor({"A": math.e, "B": 1.0})).df.sort("symbol")["factor"].to_list() assert vals[0] == pytest.approx(1.0, rel=1e-6) - assert vals[1] == pytest.approx(0.0, abs=1e-10) + assert vals[1] == pytest.approx(0.0) def test_base_10(self): assert log(make_factor({"A": 100.0}), 
base=10.0).df["factor"][0] == pytest.approx(2.0, rel=1e-6) @@ -48,7 +48,8 @@ def test_scalar_exp(self): assert vals == pytest.approx([9.0, 4.0]) def test_factor_exp(self): - vals = power(make_factor({"A": 2.0, "B": 3.0}), make_factor({"A": 3.0, "B": 2.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"base": {"A": 2.0, "B": 3.0}, "exp": {"A": 3.0, "B": 2.0}}) + vals = power(fs["base"], fs["exp"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([8.0, 9.0]) @@ -58,7 +59,8 @@ def test_preserves_sign(self): assert vals == pytest.approx([-9.0, 9.0]) def test_factor_exp(self): - vals = signed_power(make_factor({"A": -4.0, "B": 4.0}), make_factor({"A": 0.5, "B": 0.5})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"base": {"A": -4.0, "B": 4.0}, "exp": {"A": 0.5, "B": 0.5}}) + vals = signed_power(fs["base"], fs["exp"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([-2.0, 2.0]) @@ -79,7 +81,7 @@ def test_negative(self): assert s_log_1p(make_factor({"A": -9.0})).df["factor"][0] == pytest.approx(-math.log(10.0), rel=1e-6) def test_zero(self): - assert s_log_1p(make_factor({"A": 0.0})).df["factor"][0] == pytest.approx(0.0, abs=1e-10) + assert s_log_1p(make_factor({"A": 0.0})).df["factor"][0] == pytest.approx(0.0) class TestMaximum: @@ -88,7 +90,8 @@ def test_scalar(self): assert vals == pytest.approx([3.0, 0.0]) def test_factor(self): - vals = maximum(make_factor({"A": 1.0, "B": 5.0}), make_factor({"A": 3.0, "B": 2.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"a": {"A": 1.0, "B": 5.0}, "b": {"A": 3.0, "B": 2.0}}) + vals = maximum(fs["a"], fs["b"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([3.0, 5.0]) @@ -98,7 +101,8 @@ def test_scalar(self): assert vals == pytest.approx([0.0, -1.0]) def test_factor(self): - vals = minimum(make_factor({"A": 1.0, "B": 5.0}), make_factor({"A": 3.0, "B": 2.0})).df.sort("symbol")["factor"].to_list() + _, fs = 
make_panel_cs({"a": {"A": 1.0, "B": 5.0}, "b": {"A": 3.0, "B": 2.0}}) + vals = minimum(fs["a"], fs["b"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([1.0, 2.0]) @@ -113,5 +117,6 @@ def test_null_condition(self): assert vals[1] == pytest.approx(10.0) def test_factor_branches(self): - vals = where(make_factor({"A": 1.0, "B": 0.0}), make_factor({"A": 100.0, "B": 200.0}), make_factor({"A": -100.0, "B": -200.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"cond": {"A": 1.0, "B": 0.0}, "x": {"A": 100.0, "B": 200.0}, "y": {"A": -100.0, "B": -200.0}}) + vals = where(fs["cond"], fs["x"], fs["y"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([100.0, -200.0]) diff --git a/tests/test_neutralization.py b/tests/test_neutralization.py index f361b09..fba4ee2 100644 --- a/tests/test_neutralization.py +++ b/tests/test_neutralization.py @@ -1,109 +1,124 @@ from __future__ import annotations -from datetime import date + import polars as pl import pytest -from elvers import Factor from elvers.ops.neutralization import ( vector_neut, regression_neut, group_neutralize, group_rank, group_zscore, group_scale, group_normalize, group_mean, group_median, group_backfill, ) -from tests.conftest import make_factor +from tests.conftest import make_panel_cs class TestVectorNeut: def test_result_orthogonal(self): - r = vector_neut(make_factor({"A": 1.0, "B": 2.0, "C": 3.0}), make_factor({"A": 3.0, "B": 4.0, "C": 5.0})) + _, fs = make_panel_cs({"f": {"A": 1.0, "B": 2.0, "C": 3.0}, "g": {"A": 3.0, "B": 4.0, "C": 5.0}}) + r = vector_neut(fs["f"], fs["g"]) vals = r.df.sort("symbol")["factor"].to_list() assert sum(v*y for v, y in zip(vals, [3.0, 4.0, 5.0])) == pytest.approx(0.0, abs=1e-8) def test_zero_y_returns_original(self): - vals = vector_neut(make_factor({"A": 5.0, "B": 10.0}), make_factor({"A": 0.0, "B": 0.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 5.0, "B": 10.0}, "g": {"A": 0.0, "B": 0.0}}) + 
vals = vector_neut(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([5.0, 10.0]) class TestRegressionNeut: def test_residual_orthogonal_to_x(self): - r = regression_neut(make_factor({"A": 2.0, "B": 4.0, "C": 5.0, "D": 8.0}), make_factor({"A": 1.0, "B": 2.0, "C": 3.0, "D": 4.0})) + _, fs = make_panel_cs({"y": {"A": 2.0, "B": 4.0, "C": 5.0, "D": 8.0}, "x": {"A": 1.0, "B": 2.0, "C": 3.0, "D": 4.0}}) + r = regression_neut(fs["y"], fs["x"]) residuals = r.df.sort("symbol")["factor"].to_list() - assert sum(ri*xi for ri, xi in zip(residuals, [1,2,3,4])) == pytest.approx(0.0, abs=1e-6) + assert sum(ri*xi for ri, xi in zip(residuals, [1, 2, 3, 4])) == pytest.approx(0.0, abs=1e-6) def test_constant_x_returns_demean(self): - vals = regression_neut(make_factor({"A": 10.0, "B": 20.0, "C": 30.0}), make_factor({"A": 5.0, "B": 5.0, "C": 5.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"y": {"A": 10.0, "B": 20.0, "C": 30.0}, "x": {"A": 5.0, "B": 5.0, "C": 5.0}}) + vals = regression_neut(fs["y"], fs["x"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([-10.0, 0.0, 10.0], abs=1e-6) class TestGroupNeutralize: def test_subtract_group_mean(self): - vals = group_neutralize(make_factor({"A": 10.0, "B": 20.0, "C": 30.0}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0, "C": 30.0}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_neutralize(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([-10.0, 0.0, 10.0], abs=1e-6) def test_two_groups(self): - vals = group_neutralize(make_factor({"A": 10.0, "B": 20.0, "C": 100.0, "D": 200.0}), make_factor({"A": 1.0, "B": 1.0, "C": 2.0, "D": 2.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0, "C": 100.0, "D": 200.0}, "g": {"A": 1.0, "B": 1.0, "C": 2.0, "D": 2.0}}) + vals = group_neutralize(fs["f"], 
fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([-5.0, 5.0, -50.0, 50.0]) class TestGroupRank: def test_within_group(self): - vals = group_rank( - make_factor({"A": 100.0, "B": 0.0, "C": 50.0}), - make_factor({"A": 1.0, "B": 1.0, "C": 1.0}), - ).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 100.0, "B": 0.0, "C": 50.0}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_rank(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([3/3, 1/3, 2/3], rel=1e-6) class TestGroupZscore: def test_mean_zero(self): - vals = group_zscore(make_factor({"A": 10.0, "B": 20.0, "C": 30.0}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0, "C": 30.0}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_zscore(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals[1] == pytest.approx(0.0, abs=1e-6) def test_constant_returns_zero(self): - vals = group_zscore(make_factor({"A": 5.0, "B": 5.0}), make_factor({"A": 1.0, "B": 1.0})).df.sort("symbol")["factor"].to_list() - assert vals == pytest.approx([0.0, 0.0], abs=1e-10) + _, fs = make_panel_cs({"f": {"A": 5.0, "B": 5.0}, "g": {"A": 1.0, "B": 1.0}}) + vals = group_zscore(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() + assert vals == pytest.approx([0.0, 0.0]) class TestGroupScale: def test_range_zero_to_one(self): - vals = group_scale(make_factor({"A": 10.0, "B": 20.0, "C": 30.0}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0, "C": 30.0}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_scale(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([0.0, 0.5, 1.0]) class TestGroupNormalize: def test_abs_sum_equals_target(self): - vals = group_normalize(make_factor({"A": 3.0, "B": -1.0, "C": 2.0}), make_factor({"A": 1.0, "B": 
1.0, "C": 1.0}), target=1.0).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 3.0, "B": -1.0, "C": 2.0}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_normalize(fs["f"], fs["g"], target=1.0).df.sort("symbol")["factor"].to_list() assert sum(abs(v) for v in vals) == pytest.approx(1.0, rel=1e-6) def test_all_zero_returns_zero(self): - vals = group_normalize(make_factor({"A": 0.0, "B": 0.0}), make_factor({"A": 1.0, "B": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 0.0, "B": 0.0}, "g": {"A": 1.0, "B": 1.0}}) + vals = group_normalize(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == [0.0, 0.0] class TestGroupMean: def test_broadcast(self): - vals = group_mean(make_factor({"A": 10.0, "B": 30.0, "C": 100.0, "D": 200.0}), make_factor({"A": 1.0, "B": 1.0, "C": 2.0, "D": 2.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 30.0, "C": 100.0, "D": 200.0}, "g": {"A": 1.0, "B": 1.0, "C": 2.0, "D": 2.0}}) + vals = group_mean(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([20.0, 20.0, 150.0, 150.0]) def test_weighted(self): - vals = group_mean(make_factor({"A": 10.0, "B": 20.0}), make_factor({"A": 1.0, "B": 1.0}), weight=make_factor({"A": 3.0, "B": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0}, "g": {"A": 1.0, "B": 1.0}, "w": {"A": 3.0, "B": 1.0}}) + vals = group_mean(fs["f"], fs["g"], weight=fs["w"]).df.sort("symbol")["factor"].to_list() assert vals[0] == pytest.approx((10*3 + 20*1) / 4) class TestGroupMedian: def test_broadcast(self): - vals = group_median(make_factor({"A": 10.0, "B": 20.0, "C": 50.0}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0, "C": 50.0}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_median(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() 
assert all(v == pytest.approx(20.0) for v in vals) class TestGroupBackfill: def test_fills_null_with_group_mean(self): - vals = group_backfill(make_factor({"A": 10.0, "B": 20.0, "C": None}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0, "C": None}, "g": {"A": 1.0, "B": 1.0, "C": 1.0}}) + vals = group_backfill(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals[2] == pytest.approx(15.0) def test_no_nulls_unchanged(self): - vals = group_backfill(make_factor({"A": 10.0, "B": 20.0}), make_factor({"A": 1.0, "B": 1.0})).df.sort("symbol")["factor"].to_list() + _, fs = make_panel_cs({"f": {"A": 10.0, "B": 20.0}, "g": {"A": 1.0, "B": 1.0}}) + vals = group_backfill(fs["f"], fs["g"]).df.sort("symbol")["factor"].to_list() assert vals == pytest.approx([10.0, 20.0]) def test_winsorization_affects_fill(self): - r_tight = group_backfill(make_factor({"A": 10.0, "B": 10.0, "C": 10.0, "D": 1000.0, "E": None}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0, "D": 1.0, "E": 1.0}), std=1.0) - r_loose = group_backfill(make_factor({"A": 10.0, "B": 10.0, "C": 10.0, "D": 1000.0, "E": None}), make_factor({"A": 1.0, "B": 1.0, "C": 1.0, "D": 1.0, "E": 1.0}), std=10.0) + _, fs1 = make_panel_cs({"f": {"A": 10.0, "B": 10.0, "C": 10.0, "D": 1000.0, "E": None}, "g": {"A": 1.0, "B": 1.0, "C": 1.0, "D": 1.0, "E": 1.0}}) + _, fs2 = make_panel_cs({"f": {"A": 10.0, "B": 10.0, "C": 10.0, "D": 1000.0, "E": None}, "g": {"A": 1.0, "B": 1.0, "C": 1.0, "D": 1.0, "E": 1.0}}) + r_tight = group_backfill(fs1["f"], fs1["g"], std=1.0) + r_loose = group_backfill(fs2["f"], fs2["g"], std=10.0) assert r_tight.df.filter(pl.col("symbol") == "E")["factor"][0] < r_loose.df.filter(pl.col("symbol") == "E")["factor"][0] diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py index 045550d..139aa15 100644 --- a/tests/test_timeseries.py +++ b/tests/test_timeseries.py @@ -1,10 +1,11 @@ from __future__ import annotations 
+ from datetime import date import math import polars as pl import pytest from elvers import ( - Factor, + Panel, ts_delay, ts_delta, ts_mean, ts_sum, ts_std_dev, ts_min, ts_max, ts_median, ts_rank, ts_skewness, ts_kurtosis, ts_zscore, ts_corr, ts_covariance, ts_product, @@ -15,7 +16,7 @@ kth_element, last_diff_value, inst_tvr, ts_delta_limit, ts_regression, trade_when, ) -from tests.conftest import make_ts +from tests.conftest import make_ts, make_panel_ts def _last(result): @@ -97,17 +98,14 @@ def test_population(self): class TestTsCorr: def test_perfect(self): - a = make_ts([1.0, 2.0, 3.0, 4.0, 5.0]) - b = make_ts([10.0, 20.0, 30.0, 40.0, 50.0]) - assert _last(ts_corr(a, b, 5))[0] == pytest.approx(1.0, rel=1e-6) + _, fs = make_panel_ts({"a": [1.0, 2.0, 3.0, 4.0, 5.0], "b": [10.0, 20.0, 30.0, 40.0, 50.0]}) + assert _last(ts_corr(fs["a"], fs["b"], 5))[0] == pytest.approx(1.0, rel=1e-6) class TestTsCovariance: def test_population_cov(self): - a = make_ts([2.0, 4.0, 6.0, 8.0, 10.0]) - b = make_ts([1.0, 3.0, 5.0, 7.0, 9.0]) - # ddof=0 (population covariance): sum((xi-mx)(yi-my))/n = 40/5 = 8.0 - assert _last(ts_covariance(a, b, 5))[0] == pytest.approx(8.0, rel=1e-6) + _, fs = make_panel_ts({"a": [2.0, 4.0, 6.0, 8.0, 10.0], "b": [1.0, 3.0, 5.0, 7.0, 9.0]}) + assert _last(ts_covariance(fs["a"], fs["b"], 5))[0] == pytest.approx(8.0, rel=1e-6) class TestTsProduct: @@ -138,8 +136,11 @@ def test_counter(self): assert ts_step(make_ts([10.0, 20.0, 30.0, 40.0, 50.0])).df["factor"].to_list() == [1, 2, 3, 4, 5] def test_per_symbol(self): - rows = [{"timestamp": date(2024, 1, d+1), "symbol": s, "factor": 0.0} for d in range(3) for s in ["A", "B"]] - a_vals = ts_step(Factor(pl.DataFrame(rows), "t")).df.filter(pl.col("symbol") == "A").sort("timestamp")["factor"].to_list() + rows = [{"timestamp": date(2024, 1, d+1), "symbol": s, "val": 0.0} for d in range(3) for s in ["A", "B"]] + panel = Panel(pl.DataFrame(rows)) + from elvers import Factor + f = Factor("val", panel) + a_vals = 
ts_step(f).df.filter(pl.col("symbol") == "A").sort("timestamp")["factor"].to_list() assert a_vals == [1, 2, 3] @@ -174,7 +175,7 @@ def test_min_max_normalize(self): assert _last(ts_scale(make_ts([2.0, 4.0, 6.0, 8.0, 10.0]), 5))[0] == pytest.approx(1.0) def test_constant_returns_zero(self): - assert _last(ts_scale(make_ts([5.0, 5.0, 5.0]), 3))[0] == pytest.approx(0.0, abs=1e-10) + assert _last(ts_scale(make_ts([5.0, 5.0, 5.0]), 3))[0] == pytest.approx(0.0) class TestTsPercentile: @@ -253,19 +254,19 @@ def test_op_md_example(self): assert vals[0] == pytest.approx((1+2+3+3) / (101+99+102+105), rel=1e-4) def test_constant_zero(self): - assert _last(inst_tvr(make_ts([10.0]*5), 5))[0] == pytest.approx(0.0, abs=1e-10) + assert _last(inst_tvr(make_ts([10.0]*5), 5))[0] == pytest.approx(0.0) class TestTsDeltaLimit: def test_clamps_delta(self): - x = make_ts([100.0, 120.0, 110.0, 130.0, 125.0]) - y = make_ts([1000.0]*5) - vals = ts_delta_limit(x, y, limit_volume=0.01).df["factor"].to_list() + _, fs = make_panel_ts({"x": [100.0, 120.0, 110.0, 130.0, 125.0], "y": [1000.0]*5}) + vals = ts_delta_limit(fs["x"], fs["y"], limit_volume=0.01).df["factor"].to_list() assert vals[0] == pytest.approx(100.0) assert vals[1] == pytest.approx(110.0) def test_no_clamping_needed(self): - vals = ts_delta_limit(make_ts([100.0, 101.0, 102.0]), make_ts([1000.0]*3), limit_volume=0.1).df["factor"].to_list() + _, fs = make_panel_ts({"x": [100.0, 101.0, 102.0], "y": [1000.0]*3}) + vals = ts_delta_limit(fs["x"], fs["y"], limit_volume=0.1).df["factor"].to_list() assert vals[1] == pytest.approx(101.0) @@ -277,66 +278,70 @@ def _ols(self, y, x): sxy = sum(a*b for a, b in zip(x, y)) syy = sum(b*b for b in y) denom = n*sxx - sx*sx - beta = (n*sxy - sx*sy) / denom if abs(denom) > 1e-10 else 0.0 + beta = (n*sxy - sx*sy) / denom if denom != 0 else 0.0 alpha = (sy - beta*sx) / n sse = syy - alpha*sy - beta*sxy sst = syy - sy*sy/n return {"beta": beta, "alpha": alpha, "fitted": alpha + beta*x[-1], "resid": y[-1] 
- (alpha + beta*x[-1]), "sse": sse, "sst": sst, - "r2": 1 - sse/sst if abs(sst) > 1e-10 else 1.0} + "r2": 1 - sse/sst if sst != 0 else 1.0} def test_all_rettypes(self): - y, x = [3.0, 5.0, 4.0, 8.0, 7.0], [1.0, 2.0, 3.0, 4.0, 5.0] - e = self._ols(y, x) + y_vals, x_vals = [3.0, 5.0, 4.0, 8.0, 7.0], [1.0, 2.0, 3.0, 4.0, 5.0] + e = self._ols(y_vals, x_vals) + _, fs = make_panel_ts({"y": y_vals, "x": x_vals}) for rt, key in [(0, "resid"), (1, "alpha"), (2, "beta"), (3, "fitted"), (4, "sse"), (5, "sst"), (6, "r2")]: - val = _last(ts_regression(make_ts(y), make_ts(x), 5, rettype=rt))[-1] - assert val == pytest.approx(e[key], rel=1e-6, abs=1e-10), f"rettype={rt}" + val = _last(ts_regression(fs["y"], fs["x"], 5, rettype=rt))[-1] + assert val == pytest.approx(e[key], rel=1e-6), f"rettype={rt}" def test_sse_sst_r2_identity(self): - y_f, x_f = make_ts([3.0, 5.0, 4.0, 8.0, 7.0]), make_ts([1.0, 2.0, 3.0, 4.0, 5.0]) - sse = _last(ts_regression(y_f, x_f, 5, rettype=4))[-1] - sst = _last(ts_regression(y_f, x_f, 5, rettype=5))[-1] - r2 = _last(ts_regression(y_f, x_f, 5, rettype=6))[-1] + _, fs = make_panel_ts({"y": [3.0, 5.0, 4.0, 8.0, 7.0], "x": [1.0, 2.0, 3.0, 4.0, 5.0]}) + sse = _last(ts_regression(fs["y"], fs["x"], 5, rettype=4))[-1] + sst = _last(ts_regression(fs["y"], fs["x"], 5, rettype=5))[-1] + r2 = _last(ts_regression(fs["y"], fs["x"], 5, rettype=6))[-1] assert r2 == pytest.approx(1.0 - sse/sst, rel=1e-10) def test_constant_x_zero_beta(self): - assert _last(ts_regression(make_ts([10.0, 20.0, 30.0, 40.0, 50.0]), make_ts([5.0]*5), 5, rettype=2))[0] == pytest.approx(0.0, abs=1e-6) + _, fs = make_panel_ts({"y": [10.0, 20.0, 30.0, 40.0, 50.0], "x": [5.0]*5}) + assert _last(ts_regression(fs["y"], fs["x"], 5, rettype=2))[0] == pytest.approx(0.0, abs=1e-6) def test_window2_mse_null(self): - r = ts_regression(make_ts([3.0, 5.0]), make_ts([1.0, 2.0]), 2, rettype=7) + _, fs = make_panel_ts({"y": [3.0, 5.0], "x": [1.0, 2.0]}) + r = ts_regression(fs["y"], fs["x"], 2, rettype=7) assert 
r.df.filter(pl.col("timestamp") == date(2024, 1, 2))["factor"][0] is None def test_lag_shifts_x(self): - y = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] - x = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0] - # With lag=1, x is shifted by 1: regression uses x[t-1] to predict y[t] - # Effective pairs for window=4 at t=6: y=[3,4,5,6], x_lagged=[20,30,40,50] - r_beta = ts_regression(make_ts(y), make_ts(x), 4, lag=1, rettype=2) + _, fs = make_panel_ts({"y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], "x": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0]}) + r_beta = ts_regression(fs["y"], fs["x"], 4, lag=1, rettype=2) beta = r_beta.df.filter(pl.col("timestamp") == date(2024, 1, 6))["factor"][0] - # y = [3,4,5,6], x = [20,30,40,50] -> beta = cov/var = 10*1/(10*100) = 0.1 assert beta == pytest.approx(0.1, rel=1e-4) class TestTradeWhen: def test_hold(self): - vals = trade_when(make_ts([1.0, 0.0, 0.0, 1.0, 0.0]), make_ts([10.0, 20.0, 30.0, 40.0, 50.0]), make_ts([0.0]*5)).df["factor"].to_list() + _, fs = make_panel_ts({"trigger": [1.0, 0.0, 0.0, 1.0, 0.0], "alpha": [10.0, 20.0, 30.0, 40.0, 50.0], "exit": [0.0]*5}) + vals = trade_when(fs["trigger"], fs["alpha"], fs["exit"]).df["factor"].to_list() assert vals == pytest.approx([10.0, 10.0, 10.0, 40.0, 40.0]) def test_exit_clears(self): - vals = trade_when(make_ts([1.0, 0.0, 0.0, 0.0, 0.0]), make_ts([10.0]*5), make_ts([0.0, 0.0, 1.0, 0.0, 0.0])).df["factor"].to_list() + _, fs = make_panel_ts({"trigger": [1.0, 0.0, 0.0, 0.0, 0.0], "alpha": [10.0]*5, "exit": [0.0, 0.0, 1.0, 0.0, 0.0]}) + vals = trade_when(fs["trigger"], fs["alpha"], fs["exit"]).df["factor"].to_list() assert vals[:2] == pytest.approx([10.0, 10.0]) assert all(v is None for v in vals[2:]) def test_exit_then_retrade(self): - vals = trade_when(make_ts([1.0, 0.0, 0.0, 1.0, 0.0]), make_ts([10.0, 20.0, 30.0, 40.0, 50.0]), make_ts([0.0, 0.0, 1.0, 0.0, 0.0])).df["factor"].to_list() + _, fs = make_panel_ts({"trigger": [1.0, 0.0, 0.0, 1.0, 0.0], "alpha": [10.0, 20.0, 30.0, 40.0, 50.0], "exit": [0.0, 0.0, 1.0, 0.0, 0.0]}) 
+ vals = trade_when(fs["trigger"], fs["alpha"], fs["exit"]).df["factor"].to_list() assert vals[2] is None assert vals[3] == pytest.approx(40.0) def test_exit_takes_precedence(self): - vals = trade_when(make_ts([1.0, 1.0]), make_ts([10.0, 20.0]), make_ts([0.0, 1.0])).df["factor"].to_list() + _, fs = make_panel_ts({"trigger": [1.0, 1.0], "alpha": [10.0, 20.0], "exit": [0.0, 1.0]}) + vals = trade_when(fs["trigger"], fs["alpha"], fs["exit"]).df["factor"].to_list() assert vals[1] is None def test_no_initial_trigger(self): - vals = trade_when(make_ts([0.0, 0.0, 1.0, 0.0]), make_ts([10.0, 20.0, 30.0, 40.0]), make_ts([0.0]*4)).df["factor"].to_list() + _, fs = make_panel_ts({"trigger": [0.0, 0.0, 1.0, 0.0], "alpha": [10.0, 20.0, 30.0, 40.0], "exit": [0.0]*4}) + vals = trade_when(fs["trigger"], fs["alpha"], fs["exit"]).df["factor"].to_list() assert vals[0] is None and vals[1] is None assert vals[2] == pytest.approx(30.0)