Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 22 additions & 73 deletions docs/geneva/deployment/dependency-verification.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ description: Diagnose and resolve package version mismatches between local and R
icon: magnifying-glass-chart
---

import { PyQuickFixManifest, PyEnvVarsViaCluster, PyPipManifest, PyCondaClusterPath, PyCondaClusterInline } from '/snippets/geneva_dependency_verification.mdx';

When running Geneva UDFs on Ray, your code is serialized locally and executed on remote workers. If the worker environment differs from your local environment, you may encounter subtle and difficult-to-debug errors.

## Example environment mismatch errors
Expand Down Expand Up @@ -42,10 +44,11 @@ If you are encountering a hang or exception you can use the following diagnosis
</Step>
<Step>
**Fix with manifest** for quick testing:
```python
from geneva.manifest.builder import GenevaManifestBuilder
manifest = GenevaManifestBuilder.create("fix").pip(["numpy==1.26.4"]).build()
```
<CodeGroup>
<CodeBlock filename="Python" language="python" icon="python">
{PyQuickFixManifest}
</CodeBlock>
</CodeGroup>
</Step>
<Step>
**OPTIONAL: Build custom image** for production (if using KubeRay).
Expand Down Expand Up @@ -146,12 +149,12 @@ If critical environment variables are missing on workers, you can pass them via

<CodeGroup>
```python Python icon="python"
from geneva.manifest.builder import GenevaManifestBuilder
from geneva.manifest.builder import PipManifestBuilder
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This little bit is still untested because the env_vars method was just added today; once it's in the next release we can pull it out into a snippet as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both of these APIs are valid, but pip is preferred.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, GenevaManifestBuilder doesn't exist anymore, that's what got me started on this whole thing anyway 😆

import os

manifest = (
GenevaManifestBuilder.create("my-manifest")
.env({
PipManifestBuilder.create("my-manifest")
.env_vars({
"AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
"AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
"MY_API_KEY": os.environ["MY_API_KEY"],
Expand All @@ -164,23 +167,9 @@ manifest = (
**Option 2: Via Cluster Configuration**

<CodeGroup>
```python Python icon="python"
from geneva.cluster.builder import GenevaClusterBuilder
import os

cluster = (
GenevaClusterBuilder.create("my-cluster")
.ray_init_kwargs({
"runtime_env": {
"env_vars": {
"AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
"AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
}
}
})
.build()
)
```
<CodeBlock filename="Python" language="python" icon="python">
{PyEnvVarsViaCluster}
</CodeBlock>
</CodeGroup>

<Warning>
Expand Down Expand Up @@ -226,25 +215,9 @@ Watch for major version differences (NumPy 1.x vs 2.x) and PyTorch version misma
Specify packages in a Geneva manifest for a quick fix:

<CodeGroup>
```python Python icon="python"
from geneva.manifest.builder import GenevaManifestBuilder

manifest = (
GenevaManifestBuilder.create("my-manifest")
.pip([
"numpy==1.26.4",
"torch==2.0.1",
"attrs==23.2.0",
])
.build()
)

# Then use with db.context()
conn = geneva.connect("s3://my-bucket/my-db")
conn.define_manifest("my-manifest", manifest)
with conn.context(cluster="my-cluster", manifest="my-manifest"):
conn.open_table("my-table").backfill("my-column")
```
<CodeBlock filename="Python" language="python" icon="python">
{PyPipManifest}
</CodeBlock>
</CodeGroup>

*Pros*: Quick, reusable across sessions, stored with your database.
Expand Down Expand Up @@ -294,41 +267,17 @@ spec:
Use a conda environment on workers via the cluster builder:

<CodeGroup>
```python Python icon="python"
from geneva.cluster.builder import GenevaClusterBuilder

cluster = (
GenevaClusterBuilder.create("my-cluster")
.ray_init_kwargs({
"runtime_env": {"conda": "environment.yml"}
})
.build()
)
```
<CodeBlock filename="Python" language="python" icon="python">
{PyCondaClusterPath}
</CodeBlock>
</CodeGroup>

Or specify conda channels and dependencies inline:

<CodeGroup>
```python Python icon="python"
cluster = (
GenevaClusterBuilder.create("my-cluster")
.ray_init_kwargs({
"runtime_env": {
"conda": {
"channels": ["conda-forge"],
"dependencies": [
"python=3.10",
"ffmpeg<8",
"torchvision=0.22.1"
]
},
"config": {"eager_install": True}
}
})
.build()
)
```
<CodeBlock filename="Python" language="python" icon="python">
{PyCondaClusterInline}
</CodeBlock>
</CodeGroup>

*Pros*: Best for complex dependencies with native libraries (ffmpeg, CUDA).
Expand Down
12 changes: 6 additions & 6 deletions docs/snippets/basic_usage.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,6 @@ export const TsBasicVectorSearchQ4 = "// Who are the strongest characters?\ncons

export const TsDataLoad = "const data = JSON.parse(fs.readFileSync(dataPath, \"utf-8\"));\n";

export const RsBasicCreateTablePandas = "let table_copy = db\n .create_table(\"camelot_copy\", to_reader(&data)?)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await?;\n";

export const RsBasicCreateTablePolars = "let table_alt = db\n .create_table(\"camelot_alt\", to_reader(&data)?)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await?;\n";

export const RsConnect = "let uri = \"ex_lancedb\";\nlet db = connect(uri).execute().await.unwrap();\n";

export const RsBasicAddColumns = "table\n .add_columns(\n NewColumnTransform::SqlExpressions(vec![(\n \"power\".to_string(),\n \"cast(((stats.strength + stats.courage + stats.magic + stats.wisdom) / 4.0) as float)\"\n .to_string(),\n )]),\n None,\n )\n .await\n .unwrap();\n";

export const RsBasicAddData = "let magical_characters = vec![\n Character {\n id: 9,\n name: \"Morgan le Fay\".to_string(),\n role: \"Sorceress\".to_string(),\n description: \"A powerful enchantress, Arthur's half-sister, and a complex figure who oscillates between aiding and opposing Camelot.\".to_string(),\n vector: [0.10, 0.84, 0.25, 0.70],\n stats: Stats {\n strength: 2,\n courage: 3,\n magic: 5,\n wisdom: 4,\n },\n },\n Character {\n id: 10,\n name: \"The Lady of the Lake\".to_string(),\n role: \"Mystical Guardian\".to_string(),\n description: \"A mysterious supernatural figure associated with Avalon, known for giving Arthur the sword Excalibur.\".to_string(),\n vector: [0.00, 0.90, 0.58, 0.88],\n stats: Stats {\n strength: 2,\n courage: 3,\n magic: 5,\n wisdom: 5,\n },\n },\n];\ntable\n .add(characters_to_reader(camelot_schema(), &magical_characters))\n .execute()\n .await\n .unwrap();\n";
Expand All @@ -80,6 +74,10 @@ export const RsBasicCreateEmptyTable = "let schema = Arc::new(Schema::new(vec![\

export const RsBasicCreateTable = "let mut table = db\n .create_table(\"camelot\", characters_to_reader(schema.clone(), &data))\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n";

export const RsBasicCreateTablePandas = "let table_copy = db\n .create_table(\"camelot_copy\", to_reader(&data)?)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await?;\n";

export const RsBasicCreateTablePolars = "let table_alt = db\n .create_table(\"camelot_alt\", to_reader(&data)?)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await?;\n";

export const RsBasicDeleteRows = "table.delete(\"role = 'Traitor Knight'\").await.unwrap();\n";

export const RsBasicDropColumns = "table.drop_columns(&[\"power\"]).await.unwrap();\n";
Expand All @@ -100,5 +98,7 @@ export const RsBasicVectorSearchQ3 = "// Who are the strongest characters?\nlet

export const RsBasicVectorSearchQ4 = "// Who are the strongest characters?\nlet r4 = table\n .query()\n .select(Select::Columns(vec![\n \"name\".to_string(),\n \"role\".to_string(),\n \"description\".to_string(),\n \"power\".to_string(),\n ]))\n .execute()\n .await\n .unwrap()\n .try_collect::<Vec<_>>()\n .await\n .unwrap();\nprintln!(\"{r4:?}\");\n";

export const RsConnect = "let uri = \"ex_lancedb\";\nlet db = connect(uri).execute().await.unwrap();\n";

export const RsDataLoad = "let data: Vec<Character> =\n serde_json::from_str(&fs::read_to_string(camelot_json_path()).unwrap()).unwrap();\n";

16 changes: 16 additions & 0 deletions docs/snippets/geneva_dependency_verification.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}

export const PyCondaManifestInline = "from geneva.manifest.builder import CondaManifestBuilder\n\nmanifest = (\n CondaManifestBuilder.create(\"my-manifest\")\n .conda({\n \"channels\": [\"conda-forge\"],\n \"dependencies\": [\n \"python=3.10\",\n \"ffmpeg<8\",\n \"torchvision=0.22.1\",\n ],\n })\n .build()\n)\n";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:
using triple quotes (""" or ''') would allow this to be formatted and make it more readable.

Copy link
Contributor Author

@dantasse dantasse Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a feature-request issue: #197 (@prrao87 idk where you're tracking these, and it's not critical anyway bc the snippets are readable in the docs, but it's there so you can track it however you like)


export const PyCondaManifestPath = "from geneva.manifest.builder import CondaManifestBuilder\n\nmanifest = (\n CondaManifestBuilder.create(\"my-manifest\")\n .conda_environment_path(\"environment.yml\")\n .build()\n)\n";

export const PyCondaClusterInline = "from geneva.cluster.builder import KubeRayClusterBuilder\n\ncluster = (\n KubeRayClusterBuilder.create(\"my-cluster\")\n .ray_init_kwargs({\n \"runtime_env\": {\n \"conda\": {\n \"channels\": [\"conda-forge\"],\n \"dependencies\": [\n \"python=3.10\",\n \"ffmpeg<8\",\n \"torchvision=0.22.1\",\n ],\n },\n \"config\": {\"eager_install\": True},\n }\n })\n .build()\n)\n";

export const PyCondaClusterPath = "from geneva.cluster.builder import KubeRayClusterBuilder\n\ncluster = (\n KubeRayClusterBuilder.create(\"my-cluster\")\n .ray_init_kwargs({\n \"runtime_env\": {\"conda\": \"environment.yml\"}\n })\n .build()\n)\n";

export const PyEnvVarsViaCluster = "from geneva.cluster.builder import KubeRayClusterBuilder\nimport os\n\ncluster = (\n KubeRayClusterBuilder.create(\"my-cluster\")\n .ray_init_kwargs({\n \"runtime_env\": {\n \"env_vars\": {\n \"AWS_ACCESS_KEY_ID\": os.environ[\"AWS_ACCESS_KEY_ID\"],\n \"AWS_SECRET_ACCESS_KEY\": os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n }\n }\n })\n .build()\n)\n";

export const PyPipManifest = "import geneva\nfrom geneva.manifest.builder import PipManifestBuilder\n\nmanifest = (\n PipManifestBuilder.create(\"my-manifest\")\n .pip([\n \"numpy==1.26.4\",\n \"torch==2.0.1\",\n \"attrs==23.2.0\",\n ])\n .build()\n)\n\nconn = geneva.connect(\"s3://my-bucket/my-db\")\nconn.define_manifest(\"my-manifest\", manifest)\nwith conn.context(cluster=\"my-cluster\", manifest=\"my-manifest\"):\n conn.open_table(\"my-table\").backfill(\"my-column\")\n";

export const PyQuickFixManifest = "from geneva.manifest.builder import PipManifestBuilder\n\nmanifest = PipManifestBuilder.create(\"fix\").pip([\"numpy==1.26.4\"]).build()\n";

4 changes: 2 additions & 2 deletions docs/snippets/search.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ export const PyBasicHybridSearch = "data = [\n {\"text\": \"rebel spaceships

export const PyBasicHybridSearchAsync = "uri = \"data/sample-lancedb\"\nasync_db = await lancedb.connect_async(uri)\ndata = [\n {\"text\": \"rebel spaceships striking from a hidden base\"},\n {\"text\": \"have won their first victory against the evil Galactic Empire\"},\n {\"text\": \"during the battle rebel spies managed to steal secret plans\"},\n {\"text\": \"to the Empire's ultimate weapon the Death Star\"},\n]\nasync_tbl = await async_db.create_table(\"documents_async\", schema=Documents)\n# ingest docs with auto-vectorization\nawait async_tbl.add(data)\n# Create a fts index before the hybrid search\nawait async_tbl.create_index(\"text\", config=FTS())\ntext_query = \"flower moon\"\n# hybrid search with default re-ranker\nawait (await async_tbl.search(\"flower moon\", query_type=\"hybrid\")).to_pandas()\n";

export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";

export const PyClassDocuments = "class Documents(LanceModel):\n vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n text: str = embeddings.SourceField()\n";

export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";

export const PyCreateTableAsyncWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\nasync_tbl = await async_db.create_table(\n \"documents_async\", data=data, mode=\"overwrite\"\n)\n";

export const PyCreateTableWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\n# Synchronous client\ntbl = db.create_table(\"documents\", data=data, mode=\"overwrite\")\n";
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "docs"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
requires-python = ">=3.12,<3.14"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Geneva needs 3.12 :(

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm adding 3.13 right now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and actually has been supporting 3.10

but this looks good to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

woohoo! ok, sounds good.

dependencies = [
"lancedb>=0.29.2",
"lance-namespace>=0.5.2",
Expand All @@ -13,4 +13,5 @@ dependencies = [
"pytest>=9.0.1",
"pytest-asyncio>=1.3.0",
"Pillow>=11.0.0",
"geneva>=0.11.0",
]
106 changes: 106 additions & 0 deletions tests/py/test_geneva_dependency_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors

import os

import pytest


def test_quick_fix_manifest():
    """Build a one-package pip manifest and check the recorded requirement.

    The statements between the ``--8<--`` markers match the
    ``PyQuickFixManifest`` docs snippet and appear to be its source, so
    they should stay copy-paste runnable; test-only checks belong after
    the end marker.
    """
    # --8<-- [start:quick_fix_manifest]
    from geneva.manifest.builder import PipManifestBuilder

    manifest = PipManifestBuilder.create("fix").pip(["numpy==1.26.4"]).build()
    # --8<-- [end:quick_fix_manifest]
    assert manifest.pip == ["numpy==1.26.4"]


def test_env_vars_via_cluster(monkeypatch):
    """Forward local AWS credentials to workers through ``ray_init_kwargs``.

    The two environment variables read by the example are set through
    ``monkeypatch`` so the ``os.environ[...]`` lookups succeed. The test
    then checks that the built cluster exposes the forwarded key id under
    ``kuberay.ray_init_kwargs["runtime_env"]["env_vars"]``.

    The statements between the ``--8<--`` markers match the
    ``PyEnvVarsViaCluster`` docs snippet and appear to be its source; the
    ``import os`` inside that region is what makes the snippet
    self-contained, so it stays even though this module already imports
    ``os``.
    """
    monkeypatch.setenv("AWS_ACCESS_KEY_ID", "test-key-id")
    monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "test-secret")

    # --8<-- [start:env_vars_via_cluster]
    from geneva.cluster.builder import KubeRayClusterBuilder
    import os

    cluster = (
        KubeRayClusterBuilder.create("my-cluster")
        .ray_init_kwargs({
            "runtime_env": {
                "env_vars": {
                    "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
                    "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
                }
            }
        })
        .build()
    )
    # --8<-- [end:env_vars_via_cluster]
    assert cluster.kuberay.ray_init_kwargs["runtime_env"]["env_vars"]["AWS_ACCESS_KEY_ID"] == "test-key-id"


def test_pip_manifest(monkeypatch):
    """Build a multi-package pip manifest and drive it through a mocked connection.

    ``geneva.connect`` is replaced with a ``MagicMock`` that returns
    ``mock_conn``, so the ``s3://`` URI in the example is never opened.
    Besides checking the manifest contents, the test verifies that the
    example registered the manifest, entered the context with the expected
    cluster/manifest names, and requested the backfill; without those
    checks the connection half of the example could break unnoticed.

    The statements between the ``--8<--`` markers match the
    ``PyPipManifest`` docs snippet and appear to be its source, so they are
    left untouched; all test-only assertions live after the end marker.
    """
    from unittest.mock import MagicMock

    mock_conn = MagicMock()
    monkeypatch.setattr("geneva.connect", MagicMock(return_value=mock_conn))

    # --8<-- [start:pip_manifest]
    import geneva
    from geneva.manifest.builder import PipManifestBuilder

    manifest = (
        PipManifestBuilder.create("my-manifest")
        .pip([
            "numpy==1.26.4",
            "torch==2.0.1",
            "attrs==23.2.0",
        ])
        .build()
    )

    conn = geneva.connect("s3://my-bucket/my-db")
    conn.define_manifest("my-manifest", manifest)
    with conn.context(cluster="my-cluster", manifest="my-manifest"):
        conn.open_table("my-table").backfill("my-column")
    # --8<-- [end:pip_manifest]
    assert "numpy==1.26.4" in manifest.pip

    # The snippet must have gone through the patched connection, not a real one.
    assert conn is mock_conn
    mock_conn.define_manifest.assert_called_once_with("my-manifest", manifest)
    mock_conn.context.assert_called_once_with(
        cluster="my-cluster", manifest="my-manifest"
    )
    mock_conn.open_table.assert_called_once_with("my-table")
    mock_conn.open_table.return_value.backfill.assert_called_once_with("my-column")


def test_conda_cluster_path():
    """Point the Ray runtime environment at a conda ``environment.yml`` file.

    Checks that the string passed under ``runtime_env["conda"]`` is
    readable back from ``cluster.kuberay.ray_init_kwargs`` unchanged.

    The statements between the ``--8<--`` markers match the
    ``PyCondaClusterPath`` docs snippet and appear to be its source, so
    they should stay copy-paste runnable.
    """
    # --8<-- [start:conda_cluster_path]
    from geneva.cluster.builder import KubeRayClusterBuilder

    cluster = (
        KubeRayClusterBuilder.create("my-cluster")
        .ray_init_kwargs({
            "runtime_env": {"conda": "environment.yml"}
        })
        .build()
    )
    # --8<-- [end:conda_cluster_path]
    assert cluster.kuberay.ray_init_kwargs["runtime_env"]["conda"] == "environment.yml"


def test_conda_cluster_inline():
    """Describe the conda environment inline in the Ray runtime environment.

    Checks that the ``channels`` list given under ``runtime_env["conda"]``
    is readable back from ``cluster.kuberay.ray_init_kwargs`` unchanged.

    The statements between the ``--8<--`` markers match the
    ``PyCondaClusterInline`` docs snippet and appear to be its source, so
    they should stay copy-paste runnable.
    """
    # --8<-- [start:conda_cluster_inline]
    from geneva.cluster.builder import KubeRayClusterBuilder

    cluster = (
        KubeRayClusterBuilder.create("my-cluster")
        .ray_init_kwargs({
            "runtime_env": {
                "conda": {
                    "channels": ["conda-forge"],
                    "dependencies": [
                        "python=3.10",
                        "ffmpeg<8",
                        "torchvision=0.22.1",
                    ],
                },
                "config": {"eager_install": True},
            }
        })
        .build()
    )
    # --8<-- [end:conda_cluster_inline]
    assert cluster.kuberay.ray_init_kwargs["runtime_env"]["conda"]["channels"] == ["conda-forge"]
Loading
Loading