diff --git a/docs.json b/docs.json index 2607d64..5473d85 100644 --- a/docs.json +++ b/docs.json @@ -98,7 +98,7 @@ "user-events/custom" ] }, - "features/triggers", + "features/automations", "features/embedded-analytics" ] }, @@ -465,9 +465,9 @@ "group": "Traces", "pages": [ "api-reference/traces/overview", - "api-reference/traces/get-trace-details", + "api-reference/traces/get-trace", "api-reference/traces/get-thread-details", - "api-reference/traces/search-traces", + "api-reference/traces/search", "api-reference/traces/create-public-trace-path", "api-reference/traces/delete-public-trace-path" ] @@ -504,9 +504,9 @@ ] }, { - "group": "Triggers", + "group": "Automations", "pages": [ - "api-reference/triggers/create-slack-trigger" + "api-reference/automations/create-slack-automation" ] }, { @@ -614,6 +614,11 @@ "tagId": "GTM-KJ4S6Z9C" } }, + "analytics": { + "posthog": { + "apiKey": "phc_oOlj3H19T2JlGbFXmrGrjSLbDPDNyPKYdIFaTdrkXOY" + } + }, "redirects": [ { "source": "/ci/cache/busted/1773936454", diff --git a/introduction.mdx b/introduction.mdx index 3ea3780..3cc27a0 100644 --- a/introduction.mdx +++ b/introduction.mdx @@ -1,6 +1,6 @@ --- title: "LangWatch: The Complete LLMOps Platform" -description: "Ship AI agents 8x faster with comprehensive observability, evaluation, and prompt optimization. Open-source platform, with over 2.5k stars on GitHub." +description: "Ship AI agents 8x faster with comprehensive observability, evaluation, and prompt optimization. Open-source platform, with over 3k stars on GitHub." sidebarTitle: Introduction keywords: langwatch, llm, ai, observability, evaluation, prompt optimization, llmops, open-source, github --- diff --git a/llms-full.txt b/llms-full.txt index b343200..8aebd68 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -4,7 +4,7 @@ --- title: "LangWatch: The Complete LLMOps Platform" -description: "Ship AI agents 8x faster with comprehensive observability, evaluation, and prompt optimization. 
Open-source platform, with over 2.5k stars on GitHub." +description: "Ship AI agents 8x faster with comprehensive observability, evaluation, and prompt optimization. Open-source platform, with over 3k stars on GitHub." sidebarTitle: Introduction keywords: langwatch, llm, ai, observability, evaluation, prompt optimization, llmops, open-source, github --- @@ -17,61 +17,29 @@ keywords: langwatch, llm, ai, observability, evaluation, prompt optimization, ll /> -## Quick Start -Ready to start taking control of your LLM application quality? Quick start with observability or agent simulations right away: +## Quick Start - - - - + + **Starting from scratch?** Use [Better Agents](/better-agents/overview) to scaffold a new agent project. + **Want to install skills?** Browse the [Skills Directory](/skills/directory). + + ## What is LangWatch? LangWatch is the **open-source** LLMOps platform that helps teams collaboratively debug, analyze, and iterate on their LLM applications. All platform features are natively integrated to accelerate the development workflow. @@ -80,22 +48,14 @@ Building AI applications is hard. Developers spend weeks debugging issues, optim LangWatch provides the missing operations platform for AI applications. Every LLM call, tool usage, and user interaction is automatically tracked with detailed traces, spans, and metadata. See the full conversation flow, identify bottlenecks, and understand exactly how your AI applications behave in production. +## What LangWatch Does -## For Every Role - -LangWatch serves different needs across your organization, providing value to every team member working with AI applications. - -### For Developers - -Debug faster with detailed traces that show exactly what happened in each LLM call. Build datasets from production data, run batch evaluations, and continuously improve your AI applications with comprehensive debugging tools and performance insights. 
- -### For Domain Experts - -Easily sift through conversations, see topics being discussed, and annotate messages for improvement in a collaborative manner with the development team. Provide feedback on AI outputs and help guide quality improvements through intuitive interfaces. - -### For Business Teams - -Track conversation metrics, user analytics, and cost tracking with custom dashboards and reporting. Monitor AI application performance, understand user behavior, and make data-driven decisions about your AI investments. + + + + + + ## Where to Start? @@ -423,6 +383,7 @@ Use LangWatch to effortlessly integrate with popular AI model providers No-code agent builders and tools + @@ -455,6 +416,10 @@ title: Quick Start mode: "wide" --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + + LangWatch helps you understand every user interaction (**Thread**), each individual AI task (**Trace**), and all the underlying steps (**Span**) involved. We've made getting started super smooth. Let's get cracking. @@ -869,6 +834,10 @@ icon: python keywords: langchain, instrumentation, callback, langwatch, python, tracing --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + + LangWatch integrates with Langchain to provide detailed observability into your chains, agents, LLM calls, and tool usage. ## Installation @@ -951,6 +920,10 @@ description: Instrument Agno agents with LangWatch’s Python SDK to send traces keywords: agno, openinference, langwatch, python, tracing, observability --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. 
+ + LangWatch integrates with Agno through OpenInference instrumentation to capture traces from your Agno agents automatically. ## Installation @@ -2929,6 +2902,10 @@ icon: python keywords: langgraph, instrumentation, callback, langwatch, python, tracing --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + + LangWatch integrates with LangGraph to provide detailed observability into your graph-based agents, LLM calls, and tool usage. ## Installation @@ -3773,6 +3750,10 @@ icon: python keywords: openai, instrumentation, autotrack, langwatch, python --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + + LangWatch integrates with OpenAI to automatically capture detailed information about your LLM calls. ## Installation @@ -5601,6 +5582,10 @@ icon: square-js keywords: langchain, instrumentation, callback, langwatch, typescript, tracing --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + + LangWatch integrates with Langchain to provide detailed observability into your chains, agents, LLM calls, and tool usage. ## Installation @@ -5670,6 +5655,10 @@ sidebarTitle: Vercel AI SDK keywords: vercel ai sdk, langwatch, tracing, observability, vercel, ai, sdk --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + +
@@ -6075,6 +6064,10 @@ sidebarTitle: Mastra keywords: mastra, langwatch, tracing, observability, typescript, agent framework, ai agents --- + + **Quick setup?** Instead of following these steps manually, [copy a prompt](/skills/code-prompts#instrument-my-code) into your coding agent and it will set this up for you automatically. + + LangWatch integrates with Mastra through OpenTelemetry to capture traces from your Mastra agents automatically. ## Installation @@ -6932,7 +6925,7 @@ span.setType("llm"); span.setAttributes({ "langwatch.user.id": "user-123", "langwatch.thread.id": "thread-456", - "langwatch.streaming": false, + "langwatch.gen_ai.streaming": false, }); ``` @@ -6957,7 +6950,7 @@ await tracer.withActiveSpan("llm-operation", async (span) => { "langwatch.span.type": "llm", "langwatch.user.id": "user-123", "langwatch.thread.id": "thread-456", - "langwatch.streaming": false, + "langwatch.gen_ai.streaming": false, // ... more attributes with autocomplete }); }); @@ -6992,11 +6985,11 @@ LangWatch provides a comprehensive set of custom attributes for LLM-specific obs | `langwatch.user.id` | string | User identifier | `"user-123"` | | `langwatch.thread.id` | string | Conversation thread identifier | `"thread-456"` | | `langwatch.customer.id` | string | Customer identifier | `"customer-789"` | -| `langwatch.streaming` | boolean | Whether the operation involves streaming | `true`, `false` | +| `langwatch.gen_ai.streaming` | boolean | Whether the operation involves streaming | `true`, `false` | | `langwatch.input` | string/object | Input data for the span | `"Hello, how are you?"` | | `langwatch.output` | string/object | Output data from the span | `"I'm doing well, thank you!"` | | `langwatch.contexts` | array | RAG contexts for retrieval-augmented generation | Array of document contexts | -| `langwatch.tags` | array | Tags for categorizing spans | `["chat", "greeting"]` | +| `langwatch.labels` | array | Labels for categorizing spans | `["chat", "greeting"]` | | 
`langwatch.params` | object | Parameter data for operations | `{ temperature: 0.7 }` | | `langwatch.metrics` | object | Custom metrics data | `{ response_time: 1250 }` | | `langwatch.timestamps` | object | Timing information for events | `{ start: 1234567890 }` | @@ -7034,6 +7027,90 @@ LangWatch provides a comprehensive set of custom attributes for LLM-specific obs | `langwatch.langchain.run.tags` | array | Run-specific tags | `["production", "chain"]` | | `langwatch.langchain.tags` | array | LangChain operation tags | `["langchain", "llm"]` | +### Using SDK Constants + +Instead of using raw attribute strings, both SDKs provide typed constants you can import: + + + +```typescript TypeScript +import { attributes } from "langwatch"; + +span.setAttributes({ + [attributes.ATTR_LANGWATCH_SPAN_TYPE]: "llm", + [attributes.ATTR_LANGWATCH_USER_ID]: "user-123", + [attributes.ATTR_LANGWATCH_THREAD_ID]: "thread-456", + [attributes.ATTR_LANGWATCH_LABELS]: ["chat", "greeting"], + [attributes.ATTR_LANGWATCH_STREAMING]: false, +}); +``` + +```python Python +from langwatch.attributes import AttributeKey + +span.set_attribute(AttributeKey.LangWatchSpanType, "llm") +span.set_attribute(AttributeKey.LangWatchCustomerId, "customer-789") +span.set_attribute(AttributeKey.LangWatchThreadId, "thread-456") +span.set_attribute(AttributeKey.LangWatchPromptHandle, "customer-support-greeting") +``` + + + + + +**TypeScript** — `import { attributes } from "langwatch"` + +| Constant | Value | +|----------|-------| +| `ATTR_LANGWATCH_INPUT` | `langwatch.input` | +| `ATTR_LANGWATCH_OUTPUT` | `langwatch.output` | +| `ATTR_LANGWATCH_SPAN_TYPE` | `langwatch.span.type` | +| `ATTR_LANGWATCH_RAG_CONTEXTS` | `langwatch.contexts` | +| `ATTR_LANGWATCH_METRICS` | `langwatch.metrics` | +| `ATTR_LANGWATCH_SDK_VERSION` | `langwatch.sdk.version` | +| `ATTR_LANGWATCH_SDK_NAME` | `langwatch.sdk.name` | +| `ATTR_LANGWATCH_SDK_LANGUAGE` | `langwatch.sdk.language` | +| `ATTR_LANGWATCH_TIMESTAMPS` | 
`langwatch.timestamps` | +| `ATTR_LANGWATCH_EVALUATION_CUSTOM` | `langwatch.evaluation.custom` | +| `ATTR_LANGWATCH_PARAMS` | `langwatch.params` | +| `ATTR_LANGWATCH_CUSTOMER_ID` | `langwatch.customer.id` | +| `ATTR_LANGWATCH_THREAD_ID` | `langwatch.thread.id` | +| `ATTR_LANGWATCH_USER_ID` | `langwatch.user.id` | +| `ATTR_LANGWATCH_LABELS` | `langwatch.labels` | +| `ATTR_LANGWATCH_STREAMING` | `langwatch.gen_ai.streaming` | +| `ATTR_LANGWATCH_PROMPT_ID` | `langwatch.prompt.id` | +| `ATTR_LANGWATCH_PROMPT_HANDLE` | `langwatch.prompt.handle` | +| `ATTR_LANGWATCH_PROMPT_VERSION_ID` | `langwatch.prompt.version.id` | +| `ATTR_LANGWATCH_PROMPT_VERSION_NUMBER` | `langwatch.prompt.version.number` | +| `ATTR_LANGWATCH_PROMPT_SELECTED_ID` | `langwatch.prompt.selected.id` | +| `ATTR_LANGWATCH_PROMPT_VARIABLES` | `langwatch.prompt.variables` | + +**Python** — `from langwatch.attributes import AttributeKey` + +| Constant | Value | +|----------|-------| +| `AttributeKey.LangWatchInput` | `langwatch.input` | +| `AttributeKey.LangWatchOutput` | `langwatch.output` | +| `AttributeKey.LangWatchSpanType` | `langwatch.span.type` | +| `AttributeKey.LangWatchRAGContexts` | `langwatch.rag_contexts` | +| `AttributeKey.LangWatchMetrics` | `langwatch.metrics` | +| `AttributeKey.LangWatchSDKVersion` | `langwatch.sdk.version` | +| `AttributeKey.LangWatchSDKName` | `langwatch.sdk.name` | +| `AttributeKey.LangWatchSDKLanguage` | `langwatch.sdk.language` | +| `AttributeKey.LangWatchTimestamps` | `langwatch.timestamps` | +| `AttributeKey.LangWatchEventEvaluationCustom` | `langwatch.evaluation.custom` | +| `AttributeKey.LangWatchParams` | `langwatch.params` | +| `AttributeKey.LangWatchCustomerId` | `langwatch.customer.id` | +| `AttributeKey.LangWatchThreadId` | `langwatch.thread.id` | +| `AttributeKey.LangWatchPromptId` | `langwatch.prompt.id` | +| `AttributeKey.LangWatchPromptHandle` | `langwatch.prompt.handle` | +| `AttributeKey.LangWatchPromptVersionId` | `langwatch.prompt.version.id` | +| 
`AttributeKey.LangWatchPromptVersionNumber` | `langwatch.prompt.version.number` | +| `AttributeKey.LangWatchPromptSelectedId` | `langwatch.prompt.selected.id` | +| `AttributeKey.LangWatchPromptVariables` | `langwatch.prompt.variables` | + + + ## Best Practices ### Attribute Naming @@ -7062,15 +7139,15 @@ Use appropriate data types and formats: ```typescript // ✅ Good: Proper data types span.setAttributes({ - "langwatch.streaming": false, // boolean + "langwatch.gen_ai.streaming": false, // boolean "langwatch.user.id": "user-123", // string "langwatch.prompt.version.number": 2, // number - "langwatch.tags": ["chat", "greeting"], // array + "langwatch.labels": ["chat", "greeting"], // array }); // ❌ Avoid: Inconsistent data types span.setAttributes({ - "langwatch.streaming": "false", // string instead of boolean + "langwatch.gen_ai.streaming": "false", // string instead of boolean "langwatch.prompt.version.number": "2", // string instead of number }); ``` @@ -11196,11 +11273,15 @@ Go to your LangWatch project **Settings** page and copy your API key. The API ke -### Cursor +### Claude Code -1. Open Cursor Settings -2. Navigate to the **Tools and MCP** section in the sidebar -3. Add the LangWatch MCP server: +Run this command to add the MCP server: + +```bash +claude mcp add langwatch -- npx -y @langwatch/mcp-server --apiKey your-api-key-here +``` + +Or add it manually to your `~/.claude.json`: ```json { @@ -11216,15 +11297,32 @@ Go to your LangWatch project **Settings** page and copy your API key. The API ke } ``` -### Claude Code +See the [Claude Code MCP documentation](https://code.claude.com/docs/en/mcp#plugin-provided-mcp-servers) for more details. 
-Run this command to add the MCP server: -```bash -claude mcp add langwatch -- npx -y @langwatch/mcp-server --apiKey your-api-key-here +### Copilot + +Add to `.vscode/mcp.json` in your project (or use **MCP: Add Server** from the Command Palette): + +```json +{ + "servers": { + "langwatch": { + "type": "stdio", + "command": "npx", + "args": ["-y", "@langwatch/mcp-server"], + "env": { "LANGWATCH_API_KEY": "your-api-key-here" } + } + } +} ``` -Or add it manually to your `~/.claude.json`: + +### Cursor + +1. Open Cursor Settings +2. Navigate to the **Tools and MCP** section in the sidebar +3. Add the LangWatch MCP server: ```json { @@ -11240,9 +11338,28 @@ Or add it manually to your `~/.claude.json`: } ``` -See the [Claude Code MCP documentation](https://code.claude.com/docs/en/mcp#plugin-provided-mcp-servers) for more details. -### Other Editors +### ChatGPT + +1. Go to **Settings → Connectors** +2. Click **Add connector** +3. Enter the server URL: `https://mcp.langwatch.ai/sse` +4. For authentication, select **Bearer Token** and enter your LangWatch API key (get one at [app.langwatch.ai/authorize](https://app.langwatch.ai/authorize)) + +*Requires a Plus or Team plan.* + + +### Claude Chat + +1. Go to **Settings → Connectors** +2. Click **Add custom connector** +3. Enter the server URL: `https://mcp.langwatch.ai/sse` +4. 
Click **Advanced settings**, select **Bearer Token** auth, and enter your LangWatch API key (get one at [app.langwatch.ai/authorize](https://app.langwatch.ai/authorize)) + +*Requires a Pro or Max plan.* + + +### Other For other MCP-compatible editors, add the following configuration to your MCP settings file: @@ -11276,6 +11393,13 @@ Open your AI assistant chat (e.g., `Cmd/Ctrl + I` in Cursor, or `Cmd/Ctrl + Shif | `LANGWATCH_API_KEY` | `--apiKey` | API key for authentication | | `LANGWATCH_ENDPOINT` | `--endpoint` | API endpoint (default: `https://app.langwatch.ai`) | +### Two Modes + +The MCP server runs in two modes: + +- **Local (stdio)**: Default. Runs as a subprocess of your coding assistant (Claude Code, Copilot, Cursor). API key set via `--apiKey` flag or `LANGWATCH_API_KEY` env var. +- **Remote (HTTP/SSE)**: For web-based assistants (ChatGPT, Claude Chat). Hosted at `https://mcp.langwatch.ai`. API key sent as `Authorization: Bearer <your-api-key>` per session — each user brings their own key. + ## Usage Examples ### Write Agent Tests with Scenario @@ -13049,6 +13173,10 @@ sidebarTitle: Introduction keywords: langwatch, agent simulations, agent testing, agent development, agent development, agent testing --- + + **Quick setup?** [Copy the scenarios prompt](/skills/code-prompts#add-scenario-tests) into your coding agent to add simulation tests automatically. + + # What are Agent Simulations? Agent simulations are a powerful approach to testing AI agents that goes beyond traditional evaluation methods. Unlike static input-output testing, simulations test your agent's behavior in realistic, multi-turn conversations that mimic how real users would interact with your system. @@ -13161,6 +13289,10 @@ The rest of this documentation will show you how to use LangWatch's simulation v title: Getting Started --- + + **Quick setup?** [Copy the scenarios prompt](/skills/code-prompts#add-scenario-tests) into your coding agent to add simulation tests automatically. 
+ + This guide will walk you through the basic setup required to run your first simulation and see the results in LangWatch. For more in-depth information and advanced use cases, please refer to the official [`scenario` library documentation](https://github.com/langwatch/scenario). @@ -13446,655 +13578,140 @@ From here, you can click on a set to view its detailed history of batch runs. --- -# FILE: ./llm-evaluation/offline/platform/ci-cd-execution.mdx +# FILE: ./datasets/ai-dataset-generation.mdx --- -title: Run Evaluations from CI/CD -sidebarTitle: CI/CD Execution -description: Execute platform-configured evaluations from your CI/CD pipelines using the LangWatch SDKs or REST API. +title: Generating a dataset with AI +description: Generate datasets with AI to bootstrap LLM evaluations, regression tests, and simulation-based agent testing. --- -Run evaluations that you've configured in the LangWatch platform directly from your CI/CD pipelines. This enables automated quality gates for your LLM applications. - -## Overview +Getting started with evaluations can be a bit daunting, especially when you don't have a dataset to use yet. -After configuring an evaluation in LangWatch (setting up targets, evaluators, and datasets), you can trigger it programmatically using: +LangWatch allows you to generate sample datasets with our built-in AI data generator inside the Evaluation Wizard. -- **Python SDK**: `langwatch.experiment.run("your-slug")` -- **TypeScript SDK**: `langwatch.experiments.run("your-slug")` -- **REST API**: `POST /api/evaluations/v3/{slug}/run` +In the video below, we showcase the process of creating an evaluation for a Business Coaching Agent, using the AI data generator to bootstrap the dataset: -The execution uses the configuration saved in LangWatch, so you don't need to specify targets, evaluators, or datasets in your CI/CD script. + + + -## Quickstart +--- -### 1. 
Find Your Evaluation Slug +# FILE: ./datasets/automatically-from-traces.mdx -Your evaluation slug is visible in the URL when viewing your evaluation: -``` -https://app.langwatch.ai/your-project/evaluations/v3/your-evaluation-slug - ^^^^^^^^^^^^^^^^^^^^^^^^^ -``` +--- +title: Automatically build datasets from real-time traces +description: Automatically build datasets from real-time traces to power LLM evaluations, regression tests, and AI agent testing workflows. +--- -You can also find it by clicking the **CI/CD** button in the evaluation toolbar. +You can keep continuously populating the dataset with new data arriving from production by using **Automations**, mapping trace fields to any dataset columns you prefer. -### 2. Run from Your Pipeline +Simply go to the Messages page and select a filter (for example, by model); the Add Automation button will then be enabled: +
+ + LangWatch + +
+For Action, select **Add To Dataset**, and choose the right fields to map from the trace to the dataset: - ### Python + +LangWatch + +Hit save, and that's it! Now every time a new message matches the filter, the automation will be fired and the dataset will be populated with the new row. -```python -import langwatch +--- -result = langwatch.experiment.run("your-evaluation-slug") -result.print_summary() -``` +# FILE: ./datasets/overview.mdx - ### TypeScript +--- +title: Datasets +sidebarTitle: Overview +description: Create and manage datasets in LangWatch to build evaluation sets for LLMs and structured AI agent testing. +--- -```typescript -import { LangWatch } from "langwatch"; + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + -const langwatch = new LangWatch(); +## Create datasets -const result = await langwatch.experiments.run("your-evaluation-slug"); -result.printSummary(); -``` +LangWatch allows you to create and manage datasets, with a built-in Excel-like interface for collaborating with your team. - ### curl +* Import datasets in any format you want, manage columns and data types +* Keep populating the dataset with data traced from production +* Create new datasets from scratch with AI assistance +* Generate synthetic data from documents +* Import, export and manage versions -```bash -# Start the evaluation run -RUN_RESPONSE=$(curl -s -X POST "https://app.langwatch.ai/api/evaluations/v3/your-evaluation-slug/run" \ - -H "X-Auth-Token: ${LANGWATCH_API_KEY}") +### Usage -RUN_ID=$(echo $RUN_RESPONSE | jq -r '.runId') -echo "Started run: $RUN_ID" +To create a dataset, simply go to the datasets page and click the "Upload or Create Dataset" button. You will be able to select the type of dataset you want as well as the columns you want to include. 
-# Poll for completion -while true; do - STATUS_RESPONSE=$(curl -s "https://app.langwatch.ai/api/evaluations/v3/runs/$RUN_ID" \ - -H "X-Auth-Token: ${LANGWATCH_API_KEY}") + +LangWatch + +## Adding data - STATUS=$(echo $STATUS_RESPONSE | jq -r '.status') - PROGRESS=$(echo $STATUS_RESPONSE | jq -r '.progress') - TOTAL=$(echo $STATUS_RESPONSE | jq -r '.total') +There are a couple ways to add data to a dataset; - echo "Progress: $PROGRESS/$TOTAL" +- **Manually**: You can add data on a per message basis. +- **From traces**: You can fill the dataset by selecting a group of messages already captured. +- **CSV Upload**: You can fill the dataset by uploading a CSV file. +- **Continuously populate**: You can continuously populate the dataset with data traced from production. - if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failed" ]; then - break - fi +### Manually - sleep 2 -done +To add data manually, click the "Add to Dataset" button on the messages page after selecting a message. You will then be able to choose the dataset type and preview the data that will be added. -# Show summary and exit -echo $STATUS_RESPONSE | jq '.summary' + +LangWatch + -if [ "$STATUS" = "failed" ]; then - exit 1 -fi -``` +### From traces +To add data by selecting a group, simply click the "Add to Dataset" button after choosing the desired messages in the table view. You'll then be able to select the type of dataset you wish to add to and preview the data that will be included. + +LangWatch + - -Set the `LANGWATCH_API_KEY` environment variable with your project API key. -You can find it in your [project settings](/project/setup). - +### Continuously -## CI/CD Integration Examples +You can keep continuously populating the dataset with new data arriving from production by using **Automations**. See [Automatically building a dataset from traces](/datasets/automatically-from-traces) for more details. 
-### GitHub Actions -```yaml -name: LLM Evaluation +### CSV Upload -on: - pull_request: - branches: [main] - workflow_dispatch: - -jobs: - evaluate: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install dependencies - run: pip install langwatch - - - name: Run evaluation - env: - LANGWATCH_API_KEY: ${{ secrets.LANGWATCH_API_KEY }} - run: | - python -c " - import langwatch - - result = langwatch.experiment.run('my-evaluation') - result.print_summary() - " -``` - -### GitLab CI - -```yaml -evaluate: - stage: test - image: python:3.11 - script: - - pip install langwatch - - | - python -c " - import langwatch - - result = langwatch.experiment.run('my-evaluation') - result.print_summary() - " - variables: - LANGWATCH_API_KEY: $LANGWATCH_API_KEY -``` - -### CircleCI - -```yaml -version: 2.1 - -jobs: - evaluate: - docker: - - image: python:3.11 - steps: - - checkout - - run: - name: Run evaluation - command: | - pip install langwatch - python -c " - import langwatch - - result = langwatch.experiment.run('my-evaluation') - result.print_summary() - " -``` - -## Options - -### Progress Callback - -Track progress during long-running evaluations: - - - ### Python - -```python -result = langwatch.experiment.run( - "my-evaluation", - on_progress=lambda completed, total: print(f"Progress: {completed}/{total}") -) -result.print_summary() -``` - - ### TypeScript - -```typescript -const result = await langwatch.experiments.run("my-evaluation", { - onProgress: (completed, total) => { - console.log(`Progress: ${completed}/${total}`); - } -}); -result.printSummary(); -``` - - - -### Timeout - -Set a maximum time to wait for completion: - - - ### Python - -```python -result = langwatch.experiment.run( - "my-evaluation", - timeout=300.0 # 5 minutes (default: 600 seconds) -) -result.print_summary() -``` - - ### TypeScript - -```typescript -const result = await 
langwatch.experiments.run("my-evaluation", { - timeout: 300000 // 5 minutes in ms (default: 600000) -}); -result.printSummary(); -``` - - - -### Poll Interval - -Adjust how frequently to check for completion: - - - ### Python - -```python -result = langwatch.experiment.run( - "my-evaluation", - poll_interval=5.0 # Check every 5 seconds (default: 2 seconds) -) -result.print_summary() -``` - - ### TypeScript - -```typescript -const result = await langwatch.experiments.run("my-evaluation", { - pollInterval: 5000 // Check every 5 seconds in ms (default: 2000) -}); -result.printSummary(); -``` - - - -### Exit on Failure - -By default, `print_summary()` / `printSummary()` exits with code 1 when there are failures. You can disable this: - - - ### Python - -```python -result = langwatch.experiment.run("my-evaluation") -result.print_summary(exit_on_failure=False) # Don't exit automatically - -# Handle failures manually -if result.failed > 0: - print(f"Warning: {result.failed} failures, but continuing...") -``` - - ### TypeScript - -```typescript -const result = await langwatch.experiments.run("my-evaluation"); -result.printSummary(false); // Don't exit automatically - -// Handle failures manually -if (result.failed > 0) { - console.log(`Warning: ${result.failed} failures, but continuing...`); -} -``` - - - -## Results Summary - -The `print_summary()` / `printSummary()` method outputs a CI-friendly summary: - -``` -════════════════════════════════════════════════════════════ - EVALUATION RESULTS -════════════════════════════════════════════════════════════ - Run ID: run_abc123 - Status: COMPLETED - Duration: 45.2s -──────────────────────────────────────────────────────────── - Passed: 42 - Failed: 3 - Pass Rate: 93.3% -──────────────────────────────────────────────────────────── - TARGETS: - GPT-4o: 20 passed, 2 failed - Avg latency: 1250ms - Total cost: $0.0125 - Claude 3.5: 22 passed, 1 failed - Avg latency: 980ms - Total cost: $0.0098 
-──────────────────────────────────────────────────────────── - EVALUATORS: - Exact Match: 85.0% pass rate - Faithfulness: 95.0% pass rate - Avg score: 0.87 -──────────────────────────────────────────────────────────── - View details: https://app.langwatch.ai/project/experiments/my-eval?runId=run_abc123 -════════════════════════════════════════════════════════════ -``` - -## Result Object - -The result object contains detailed information about the run: - - - ### Python - -```python -result = langwatch.experiment.run("my-evaluation") - -# Basic metrics -result.run_id # Unique run identifier -result.status # "completed", "failed", or "stopped" -result.passed # Number of passed evaluations -result.failed # Number of failed evaluations -result.pass_rate # Percentage passed (0-100) -result.duration # Total duration in milliseconds -result.run_url # URL to view in LangWatch - -# Detailed summary -result.summary.total_cells # Total cells executed -result.summary.completed_cells # Successfully completed -result.summary.failed_cells # Failed executions -result.summary.targets # Per-target statistics -result.summary.evaluators # Per-evaluator statistics - -# Print and exit on failure -result.print_summary() -``` - - ### TypeScript - -```typescript -const result = await langwatch.experiments.run("my-evaluation"); - -// Basic metrics -result.runId // Unique run identifier -result.status // "completed" | "failed" | "stopped" -result.passed // Number of passed evaluations -result.failed // Number of failed evaluations -result.passRate // Percentage passed (0-100) -result.duration // Total duration in milliseconds -result.runUrl // URL to view in LangWatch - -// Detailed summary -result.summary.totalCells // Total cells executed -result.summary.completedCells // Successfully completed -result.summary.failedCells // Failed executions -result.summary.targets // Per-target statistics -result.summary.evaluators // Per-evaluator statistics - -// Print and exit on failure 
-result.printSummary(); -``` - - - -## Error Handling - - - ### Python - -```python -from langwatch.evaluation import ( - EvaluationNotFoundError, - EvaluationTimeoutError, - EvaluationRunFailedError, - EvaluationsApiError, -) - -try: - result = langwatch.experiment.run("my-evaluation", timeout=300) - result.print_summary() -except EvaluationNotFoundError: - print("Evaluation slug not found") - exit(1) -except EvaluationTimeoutError as e: - print(f"Timeout: {e.progress}/{e.total} completed") - exit(1) -except EvaluationRunFailedError as e: - print(f"Run failed: {e.error_message}") - exit(1) -except EvaluationsApiError as e: - print(f"API error: {e} (status: {e.status_code})") - exit(1) -``` - - ### TypeScript - -```typescript -import { - EvaluationNotFoundError, - EvaluationTimeoutError, - EvaluationRunFailedError, - EvaluationsApiError, -} from "langwatch"; - -try { - const result = await langwatch.experiments.run("my-evaluation", { timeout: 300000 }); - result.printSummary(); -} catch (error) { - if (error instanceof EvaluationNotFoundError) { - console.error("Evaluation slug not found"); - } else if (error instanceof EvaluationTimeoutError) { - console.error(`Timeout: ${error.progress}/${error.total} completed`); - } else if (error instanceof EvaluationRunFailedError) { - console.error(`Run failed: ${error.errorMessage}`); - } else if (error instanceof EvaluationsApiError) { - console.error(`API error: ${error.message} (status: ${error.statusCode})`); - } - process.exit(1); -} -``` - - - -## REST API Reference - -### Start a Run - -``` -POST /api/evaluations/v3/{slug}/run -``` - -**Headers:** -- `X-Auth-Token: your-api-key` or `Authorization: Bearer your-api-key` - -**Response:** -```json -{ - "runId": "run_abc123", - "status": "running", - "total": 45, - "runUrl": "https://app.langwatch.ai/project/experiments/my-eval?runId=run_abc123" -} -``` - -### Get Run Status - -``` -GET /api/evaluations/v3/runs/{runId} -``` - -**Headers:** -- `X-Auth-Token: your-api-key` 
or `Authorization: Bearer your-api-key` - -**Response (running):** -```json -{ - "runId": "run_abc123", - "status": "running", - "progress": 20, - "total": 45, - "startedAt": 1702500000000 -} -``` - -**Response (completed):** -```json -{ - "runId": "run_abc123", - "status": "completed", - "progress": 45, - "total": 45, - "startedAt": 1702500000000, - "finishedAt": 1702500045000, - "summary": { - "runId": "run_abc123", - "totalCells": 45, - "completedCells": 45, - "failedCells": 3, - "duration": 45000, - "runUrl": "https://app.langwatch.ai/project/experiments/my-eval?runId=run_abc123" - } -} -``` - -## What's Next? - - - - Configure your first evaluation in LangWatch - - - Write evaluations directly in code - - - Browse available evaluation metrics - - - Learn about dataset management - - - ---- - -# FILE: ./datasets/ai-dataset-generation.mdx - ---- -title: Generating a dataset with AI -description: Generate datasets with AI to bootstrap LLM evaluations, regression tests, and simulation-based agent testing. ---- - -Getting started with evaluations can be a bit daunting, especially when you don't have a dataset to use yet. - -LangWatch allows you to generate sample datasets with our built-in AI data generator inside the Evaluation Wizard. - -In the video below, we showcase the process of creating an evaluation for a Business Coaching Agent, using the AI data generator to bootstrap the dataset: - - - - - ---- - -# FILE: ./datasets/automatically-from-traces.mdx - ---- -title: Automatically build datasets from real-time traces -description: Automatically build datasets from real-time traces to power LLM evaluations, regression tests, and AI agent testing workflows. ---- - -You can keep continously populating the dataset with new data arriving from production by using **Automations**, mapping trace fields to any dataset columns you prefer. - -Simply go to the Messages page and select a filter (for example, by model), the Add Trigger button will be enabled: - -
- - LangWatch - -
-For Action, select **Add To Dataset**, and chose the right fields to map from the trace to the dataset: - - -LangWatch - -Hit save, and that's it! Now every time a new message matches the filter, the trigger will be fired and the dataset will be populated with the new row. - ---- - -# FILE: ./datasets/overview.mdx - ---- -title: Datasets -sidebarTitle: Overview -description: Create and manage datasets in LangWatch to build evaluation sets for LLMs and structured AI agent testing. ---- - -## Create datasets - -LangWatch allows you to create and manage datasets, with a built-in excel-like interface for collaborating with your team. - -* Import datasets in any format you want, manage columns and data types -* Keep populating the dataset with data traced from production -* Create new datasets from scratch with AI assistance -* Generate synthetic data from documents -* Import, export and manage versions - -### Usage - -To create a dataset, simply go to the datasets page and click the "Upload or Create Dataset" button. You will be able to select the type of dataset you want as well as the columns you want to include. - - -LangWatch - -## Adding data - -There are a couple ways to add data to a dataset; - -- **Manually**: You can add data on a per message basis. -- **From traces**: You can fill the dataset by selecting a group of messages already captured. -- **CSV Upload**: You can fill the dataset by uploading a CSV file. -- **Continuously populate**: You can continuously populate the dataset with data traced from production. - -### Manually - -To add data manually, click the "Add to Dataset" button on the messages page after selecting a message. You will then be able to choose the dataset type and preview the data that will be added. - - -LangWatch - - -### From traces - -To add data by selecting a group, simply click the "Add to Dataset" button after choosing the desired messages in the table view. 
You'll then be able to select the type of dataset you wish to add to and preview the data that will be included. - - -LangWatch - - -### Continuously - -You can keep continuously populating the dataset with new data arriving from production by using **Automations**. See [Automatically building a dataset from traces](/datasets/automatically-from-traces) for more details. - - -### CSV Upload - -To add data by CSV upload, go to your datasets page and select the dataset you want to update. Click the "Upload CSV" button and upload your CSV or JSONL file. You can then map the columns from your file to the appropriate fields in the dataset based on the dataset type. +To add data by CSV upload, go to your datasets page and select the dataset you want to update. Click the "Upload CSV" button and upload your CSV or JSONL file. You can then map the columns from your file to the appropriate fields in the dataset based on the dataset type. - + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + Online evaluation lets you continuously score your LLM's production traffic. Unlike [experiments](/evaluations/experiments/overview) which test before deployment, online evaluation monitors your live application to catch quality issues, detect regressions, and ensure safety. @@ -16327,7 +15948,7 @@ trace.end(); Once enabled, scores will appear on: - **Traces** - Individual trace scores visible in trace details - **Analytics** - Aggregate metrics over time -- **Alerts** - Configure triggers for low scores +- **Alerts** - Configure automations for low scores ## Adding Scores via Code @@ -16420,6 +16041,10 @@ sidebarTitle: Overview description: Block or modify harmful LLM responses in real-time to enforce safety and policy constraints. 
--- + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + Guardrails are evaluators that run in real-time and **act** on the results - blocking, modifying, or rejecting responses that violate your safety or policy rules. Unlike [monitors](/evaluations/online-evaluation/overview) which only measure and alert, guardrails actively prevent harmful content from reaching users. ## Guardrails vs Monitors @@ -16649,6 +16274,10 @@ sidebarTitle: Code Integration description: Add guardrails to your LLM application to block harmful content in real-time. --- + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + This guide shows how to integrate guardrails into your application using the LangWatch SDK. Guardrails run evaluators synchronously and return results you can act on immediately. ## Basic Usage @@ -17101,6 +16730,10 @@ sidebarTitle: Overview description: Ensure quality and safety for your LLM applications with experiments, online evaluation, guardrails, and evaluators. --- + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + LangWatch provides comprehensive evaluations tools for your LLM applications. Whether you're evaluating before deployment or monitoring in production, we have you covered. ## The Agent Evaluation Lifecycle @@ -17251,6 +16884,10 @@ sidebarTitle: Via SDK description: Run experiments programmatically from notebooks or scripts to batch test your LLM applications. --- + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + LangWatch makes it easy to run experiments from code. 
Just add a few lines to start tracking your experiments. @@ -18218,6 +17855,10 @@ sidebarTitle: Overview description: Run batch tests on your LLM applications to measure quality, compare configurations, and catch regressions before production. --- + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + Experiments let you systematically test your LLM applications before deploying to production. Run your prompts, models, or agents against datasets and measure quality with evaluators. ## What is an Experiment? @@ -18386,6 +18027,312 @@ Learn more about [CI/CD integration](/evaluations/experiments/ci-cd). --- +# FILE: ./evaluations/experiments/multimodal-evaluation.mdx + +--- +title: Multimodal Evaluation — Images, PDFs, and Vision +sidebarTitle: Multimodal Evaluation +description: Evaluate image generation, document parsing, and other multimodal AI pipelines with LLM-as-a-Judge vision models. +--- + +LangWatch supports multimodal evaluation out of the box. You can evaluate image inputs and outputs using any vision-capable model (GPT-4o, GPT-5.2, Claude Sonnet, Gemini, etc.) through the built-in LLM-as-a-Judge evaluators — no custom code required. + +This covers common multimodal use cases: +- **Image generation quality** — score outputs of image generation models +- **Document parsing** — evaluate extracted metadata from PDFs and scanned documents +- **Content moderation** — detect NSFW or low-quality uploaded images +- **Visual QA** — evaluate answers to questions about images +- **Image comparison** — compare generated outputs against reference images + + +**Image support works with all three LLM-as-a-Judge evaluator types:** +- **Boolean** — pass/fail evaluation (e.g. "Is the generated image photorealistic?") +- **Score** — numeric score evaluation (e.g. "Rate image quality from 1-5") +- **Category** — classification evaluation (e.g. 
"Classify the image as: excellent / good / poor") + +**See also:** +- [Dataset Images](/datasets/dataset-images) — Setting up image columns in datasets +- [Saved Evaluators](/evaluations/evaluators/saved-evaluators) — Reuse evaluators via API + + +## Supported Image Formats + +Images can be provided in any of these formats: + +| Format | Example | +|--------|---------| +| **Image URL** | `https://example.com/photo.png` | +| **Base64 data URI** | `data:image/png;base64,iVBORw0KGgo...` | +| **Markdown image** | `![alt text](https://example.com/photo.png)` | + +Supported extensions: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.svg`, `.bmp`, `.tiff` + + +Image detection is strict by design — a field is treated as an image only when the **entire value** is an image URL or base64 string. Mixed text-and-image content is sent as plain text. This prevents unintended multipart content when a field happens to contain an image URL as part of a longer string. + + +## Evaluating Images via UI + +### Step 1: Create a Dataset with Image Columns + +1. Go to **Evaluations** → **New Evaluation** → **Create Experiment** +2. Click **+** next to the Datasets header to create a new dataset +3. Add columns and set their type to **image** using the column type dropdown + +
+ + Column type selector showing image option + +
+ +4. Paste image URLs or base64 data URIs into the cells — the workbench renders them inline with click-to-expand + +### Step 2: Add an LLM-as-a-Judge Evaluator + +1. Click **+ Add evaluator** on a row in the evaluators section +2. Select an **LLM-as-a-Judge** evaluator (Boolean, Score, or Category) +3. Choose a **vision-capable model** (e.g. `gpt-5.2`, `claude-sonnet-4-5-20250929`) +4. Write a prompt that references the image fields — map dataset columns to the evaluator's `input`, `output`, `contexts`, or `expected_output` variables + +The evaluator automatically detects image values and sends them as multipart content to the vision model. No special configuration needed. + + + LangWatch experiments workbench showing image evaluation with LLM-as-a-Judge score evaluator + + +In this example, a virtual try-on pipeline is evaluated with three image columns: +- **original** → mapped to `contexts` (the person's photo) +- **request** → mapped to `input` (the clothing item) +- **generated** → mapped to `output` (the try-on result) + +The LLM-as-a-Judge prompt instructs the model to evaluate all three images and score the quality of the generated output. + +### Step 3: Run and Iterate + +Click the **play button** to run the evaluator. The model receives all images as vision content and returns structured results (score, pass/fail, or category) with detailed reasoning. + +Use this workflow to **iterate on your evaluator prompt** until you have reliable evaluation criteria, then save it for reuse across experiments and CI/CD pipelines. + +## Custom Workflow Evaluators for Complex Logic + +For more advanced evaluation pipelines, you can create a **Custom Workflow Evaluator** in the Evaluators page. This gives you a visual workflow builder where you can chain multiple LLM nodes, add image variables to prompts, and build multi-step evaluation logic. + + + LangWatch custom workflow evaluator showing image variables in prompt template + + +In the workflow builder: +1. 
Add **image-typed variables** to your prompt node inputs +2. Use `{{ "{{variable_name}}" }}` syntax to reference images in the prompt template +3. Map dataset columns to the image variables in the entry node +4. The workflow handles multipart content assembly automatically + +This is useful when you need to split evaluation into multiple steps, use different models for different aspects, or combine vision evaluation with text-based checks. + +## Evaluating Images via SDK + +For programmatic evaluation from notebooks or CI/CD, use the Python or TypeScript SDK with a [saved evaluator](/evaluations/evaluators/saved-evaluators). + +### Using a Saved Evaluator + +After iterating on your evaluator in the UI, save it and call it from code: + + +```python Python +import langwatch + +df = langwatch.datasets.get_dataset("my-image-dataset").to_pandas() + +experiment = langwatch.experiment.init("image-quality-evaluation") + +for index, row in experiment.loop(df.iterrows()): + # Use your saved image evaluator + experiment.evaluate( + "evaluators/image-quality-scorer", # Your saved evaluator slug + index=index, + data={ + "input": row["request_image"], # Image URL or base64 + "output": row["generated_image"], # Image URL or base64 + "contexts": [row["original_photo"]], # List of context images + }, + ) +``` + +```typescript TypeScript +import { LangWatch } from "langwatch"; + +const langwatch = new LangWatch(); + +const dataset = await langwatch.datasets.get("my-image-dataset"); +const experiment = await langwatch.experiments.init("image-quality-evaluation"); + +await experiment.run( + dataset.entries.map((e) => e.entry), + async ({ item, index }) => { + // Use your saved image evaluator + await experiment.evaluate("evaluators/image-quality-scorer", { + index, + data: { + input: item.request_image, // Image URL or base64 + output: item.generated_image, // Image URL or base64 + contexts: [item.original_photo], // List of context images + }, + }); + }, + { concurrency: 4 } +); 
+``` + + +### Custom Scoring with Vision Models + +You can also call vision models directly and log custom scores: + +```python +import langwatch +import litellm + +experiment = langwatch.experiment.init("custom-image-evaluation") + +for index, row in experiment.loop(df.iterrows()): + # Call a vision model directly + response = litellm.completion( + model="gpt-4o", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "Rate this generated image quality from 1 to 5. Return only the number."}, + {"type": "image_url", "image_url": {"url": row["generated_image"]}}, + ], + }], + ) + + score = int(response.choices[0].message.content.strip()) + + experiment.log( + "image_quality", + index=index, + data={"output": row["generated_image"]}, + score=score / 5.0, + passed=score >= 3, + details=f"Image quality score: {score}/5", + ) +``` + +## Evaluating Document Parsing (PDFs) + +Multimodal evaluation also covers document-based pipelines. Here is an example of evaluating a PDF parsing pipeline that extracts metadata from academic papers: + +```python +import langwatch +import pandas as pd +from unstructured.partition.pdf import partition_pdf +from unstructured.staging.base import elements_to_text +import dspy + +dspy.configure(lm=dspy.LM("openai/gpt-4o-mini")) + +# Dataset of PDFs with ground truth metadata +df = pd.DataFrame([ + { + "file": "paper1.pdf", + "expected_title": "Vibe Coding vs. Agentic Coding", + "expected_authors": "Ranjan Sapkota, Konstantinos I. Roumeliotis, Manoj Karkee", + }, + # ... 
more rows +]) + +@langwatch.trace() +def extract_pdf_info(filename): + langwatch.get_current_trace().autotrack_dspy() + elements = partition_pdf(filename=filename) + pdf = elements_to_text(elements=elements) + return dspy.Predict( + "pdf -> title: str, author_names: str, github_link: Optional[str]" + )(pdf=pdf) + +# Run the evaluation +evaluation = langwatch.experiment.init("pdf-parsing-evaluation") + +for index, row in evaluation.loop(df.iterrows()): + response = extract_pdf_info(row["file"]) + + evaluation.log( + "author_names_accuracy", + index=index, + passed=response.author_names == row["expected_authors"], + details=f"Expected: {row['expected_authors']}, Got: {response.author_names}", + ) +``` + +## Using Evaluators via API + +Once you have a reliable image evaluator, you can call it directly via REST API for integration into any pipeline: + +```bash +curl -X POST "https://app.langwatch.ai/api/evaluations/evaluators/image-quality-scorer/evaluate" \ + -H "X-Auth-Token: $LANGWATCH_API_KEY" \ + -H "Content-Type: application/json" \ + -d @- < +Base64 image payloads can be large. The evaluator API supports request bodies up to **30 MB**. If you are working with many high-resolution images, prefer using image URLs over base64 encoding. + + +## Model Compatibility + +Image evaluation requires a **vision-capable model**. Any model supported by [litellm](https://docs.litellm.ai/docs/providers) with vision capabilities works, including: + +| Provider | Models | +|----------|--------| +| OpenAI | `gpt-4o`, `gpt-4o-mini`, `gpt-5.2` | +| Anthropic | `claude-sonnet-4-5-20250929`, `claude-opus-4-6` | +| Google | `gemini-2.0-flash`, `gemini-2.5-pro` | + + +If a non-vision model is selected, the evaluator falls back to sending plain text descriptions. For accurate image evaluation, always select a vision-capable model. 
+ + +## Next Steps + + + + + + + + +--- + # FILE: ./evaluations/experiments/ci-cd.mdx --- @@ -19096,6 +19043,10 @@ sidebarTitle: Overview description: Understand evaluators - the scoring functions that assess your LLM outputs for quality, safety, and correctness. --- + + **Let your agent set this up.** [Copy the evaluations prompt](/skills/code-prompts#set-up-evaluations) into your coding agent to get started automatically. + + Evaluators are scoring functions that assess the quality of your LLM's outputs. They're the building blocks for [experiments](/evaluations/experiments/overview), [online evaluation](/evaluations/online-evaluation/overview), and [guardrails](/evaluations/guardrails/overview). ## Choose Your Approach @@ -19652,10 +19603,10 @@ Custom scores appear in: href="/evaluations/experiments/overview" /> @@ -19693,9 +19644,7 @@ Saved evaluators are pre-configured evaluation setups that you create on the Lan 4. Configure the settings (model, prompt, thresholds, etc.) 5. Give it a descriptive name and save - - Creating a saved evaluator - +{/* TODO: Add screenshot of saved evaluator creation UI */} ### Via the Evaluators Page @@ -23715,7 +23664,7 @@ LangWatch is composed of a few core components: - **LangWatch App**: The main application and user interface for the platform. - **LangEvals**: The evaluation service for LangWatch built-in evaluators. - **LangWatch NLP**: The service the runs workflows and custom built evaluators in the platform. -- **LangWatch Workers**: The background workers running inside the app, processing traces, triggering alerts, real-time evaluations and so on. +- **LangWatch Workers**: Background workers that process traces, trigger alerts, run real-time evaluations and so on. By default, workers are deployed as a separate pod for independent scaling. - **Redis**: A redis queue that coordinates and add resiliency for the traces to be processed. 
- **Elasticsearch/OpenSearch**: The main database for the platform, storing traces and evaluation results. - **PostgreSQL**: The core database for the platform, storing user accounts, teams, and other metadata. @@ -23906,7 +23855,7 @@ The primary LangWatch application that provides the web interface and core funct - API endpoints for trace collection and management - User authentication and session management - Dashboard and analytics interface -- Background workers for processing traces and evaluations +- Background workers for processing traces and evaluations (deployed as a separate pod in Helm by default) **Key Features:** - Trace collection and storage @@ -24110,8 +24059,16 @@ app: secretKeyRef: name: langwatch-secrets key: metricsApiKey - sentry: - enabled: false + +workers: + replicaCount: 2 + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 4Gi ingress: enabled: true @@ -24188,8 +24145,6 @@ app: telemetry: metrics: enabled: false - sentry: - enabled: false resources: requests: cpu: 250m @@ -24283,6 +24238,16 @@ app: name: langwatch-secrets key: metricsApiKey +workers: + replicaCount: 2 + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 4Gi + ingress: enabled: true className: nginx @@ -24546,7 +24511,7 @@ If you want to use SSO with our Cloud version, contact our team with the require | Configuration | Value | | ------------------ | ------------------------------------------------------------------------------------------- | -| Required Variables | `AZURE_AD_CLIENT_ID`
`AZURE_AD_CLIENT_SECRET`
`AZURE_AD_TENANT_ID` | +| Required Variables | `AZURE_AD_CLIENT_ID`
`AZURE_AD_CLIENT_SECRET`
`AZURE_AD_TENANT_ID`
`AZURE_AD_DOMAIN` | | Redirect URL | `https://langwatch.eu.auth0.com/login/callback` | ### Google Workspace @@ -25521,6 +25486,58 @@ openapi: 'PATCH /api/annotations/{id}' --- +--- + +# FILE: ./api-reference/saved-evaluators/overview.mdx + +--- +title: "Overview" +description: "Manage saved evaluator configurations for your project" +--- + +## Intro + +The Saved Evaluators API lets you manage reusable evaluator configurations for your project. You can list, retrieve, and create saved evaluators that can then be used for online evaluations, guardrails, and experiments. + +Each saved evaluator stores a name, an evaluator type (e.g. `langevals/exact_match`), and its settings configuration. + +## Authentication + +To make a call to the Saved Evaluators API, you will need to pass through your LangWatch API key in the header as `X-Auth-Token`. Your API key can be found on the setup page under settings. + +#### Allowed Methods + +- `GET /api/evaluators` - List all saved evaluators for a project +- `GET /api/evaluators/:idOrSlug` - Get a specific evaluator by ID or slug +- `POST /api/evaluators` - Create a new saved evaluator + +--- + +# FILE: ./api-reference/saved-evaluators/create-evaluator.mdx + +--- +title: "Create evaluator" +openapi: "POST /api/evaluators" +--- + +--- + +# FILE: ./api-reference/saved-evaluators/get-evaluators.mdx + +--- +title: "List evaluators" +openapi: "GET /api/evaluators" +--- + +--- + +# FILE: ./api-reference/saved-evaluators/get-evaluator.mdx + +--- +title: "Get evaluator" +openapi: "GET /api/evaluators/{idOrSlug}" +--- + --- # FILE: ./api-reference/endpoint/create.mdx diff --git a/llms.txt b/llms.txt index b6d3ac2..75d51ff 100644 --- a/llms.txt +++ b/llms.txt @@ -5,9 +5,17 @@ Always navigate to docs links using the .md extension for better readability. 
## Get Started -- [LangWatch: The Complete LLMOps Platform](https://langwatch.ai/docs/introduction.md): Ship AI agents 8x faster with comprehensive observability, evaluation, and prompt optimization. Open-source platform, with over 2.5k stars on GitHub. -- [Better Agents](https://langwatch.ai/docs/better-agents/overview.md): Build reliable, testable, production-grade AI agents with Better Agents CLI - the reliability layer for agent development +- [LangWatch: The Complete LLMOps Platform](https://langwatch.ai/docs/introduction.md): Ship AI agents 8x faster with comprehensive observability, evaluation, and prompt optimization. Open-source platform, with over 3k stars on GitHub. + +### LangWatch Skills + +- [Skills Directory](https://langwatch.ai/docs/skills/directory.md): Get started with LangWatch in seconds. Copy a prompt, install a skill, or set up the MCP — your AI agent does the rest. +- [Code Prompts](https://langwatch.ai/docs/skills/code-prompts.md): Copy a prompt, paste it into your coding assistant, done. +- [Platform Prompts](https://langwatch.ai/docs/skills/platform-prompts.md): Ask your chat assistant to query performance, set up evaluators, and create scenarios. +- [Prompt Recipes](https://langwatch.ai/docs/skills/recipes.md): Domain-specific, actionable recipes for improving your AI agent. + - [LangWatch MCP Server](https://langwatch.ai/docs/integration/mcp.md): Use the LangWatch MCP Server to extend your coding assistant with deep LangWatch insights for tracing, testing, and agent evaluations. +- [Better Agents](https://langwatch.ai/docs/better-agents/overview.md): Build reliable, testable, production-grade AI agents with Better Agents CLI - the reliability layer for agent development ## Agent Simulations @@ -82,11 +90,6 @@ Always navigate to docs links using the .md extension for better readability. 
- [Capturing Metadata and Attributes](https://langwatch.ai/docs/integration/python/tutorials/capturing-metadata.md): Learn how to enrich your traces and spans with custom metadata and attributes using the LangWatch Python SDK. - [Capturing Metadata and Attributes](https://langwatch.ai/docs/integration/typescript/tutorials/capturing-metadata.md): Learn how to enrich your traces and spans with custom metadata and attributes using the LangWatch TypeScript SDK. -#### Tracking Conversations - -- [Tracking Conversations](https://langwatch.ai/docs/integration/python/tutorials/tracking-conversations.md): Group related traces into conversations using thread_id so you can view and evaluate entire chat sessions in LangWatch. -- [Tracking Conversations](https://langwatch.ai/docs/integration/typescript/tutorials/tracking-conversations.md): Group related traces into conversations using thread_id so you can view and evaluate entire chat sessions in LangWatch. - #### Tracking LLM Costs - [Tracking LLM Costs and Tokens](https://langwatch.ai/docs/integration/python/tutorials/tracking-llm-costs.md): Track LLM costs and tokens with LangWatch to monitor efficiency and support performance evaluations in agent testing. @@ -306,9 +309,9 @@ Always navigate to docs links using the .md extension for better readability. ## Traces - [Overview](https://langwatch.ai/docs/api-reference/traces/overview.md): Search, retrieve, and share LangWatch traces via the REST API. Traces capture the full execution of your LLM pipelines including all spans, evaluations, and metadata. 
-- [Search traces](https://langwatch.ai/docs/api-reference/traces/search.md) -- [Get trace details](https://langwatch.ai/docs/api-reference/traces/get-trace.md) +- [Get Trace Details](https://langwatch.ai/docs/api-reference/traces/get-trace-details.md) - [Get thread details](https://langwatch.ai/docs/api-reference/traces/get-thread-details.md) +- [Search Traces](https://langwatch.ai/docs/api-reference/traces/search-traces.md) - [Create public path for single trace](https://langwatch.ai/docs/api-reference/traces/create-public-trace-path.md) - [Delete an existing public path for a trace](https://langwatch.ai/docs/api-reference/traces/delete-public-trace-path.md) diff --git a/skills/code-prompts.mdx b/skills/code-prompts.mdx index 2a0ba0a..cfdaf90 100644 --- a/skills/code-prompts.mdx +++ b/skills/code-prompts.mdx @@ -20,11 +20,11 @@ import { PROMPTS } from "/snippets/prompts-data.jsx" - - - - - + + + + + diff --git a/skills/directory.mdx b/skills/directory.mdx index af6a4db..5f3ac0d 100644 --- a/skills/directory.mdx +++ b/skills/directory.mdx @@ -12,11 +12,11 @@ Don't want to install skills? Copy a ready-to-paste prompt instead: [Code Prompt Install any skill with a single command: - - - - - + + + + + **Starting an agent from scratch?** Use [Better Agents](/better-agents/overview) to scaffold a production-ready project with all LangWatch features built in. 
diff --git a/skills/recipes.mdx b/skills/recipes.mdx index 2a78e8b..3f8fbb6 100644 --- a/skills/recipes.mdx +++ b/skills/recipes.mdx @@ -19,11 +19,11 @@ import { PROMPTS } from "/snippets/prompts-data.jsx" - - - - - - + + + + + + diff --git a/snippets/copy-line.jsx b/snippets/copy-line.jsx index bc0a1e2..c0d78f6 100644 --- a/snippets/copy-line.jsx +++ b/snippets/copy-line.jsx @@ -1,4 +1,8 @@ -import React, { useState } from "react"; +const { useState } = React; + +const trackEvent = (name, props) => { + try { window.posthog?.capture(name, props); } catch {} +}; export const CopyLine = ({ text }) => { const [copied, setCopied] = useState(false); @@ -6,26 +10,14 @@ export const CopyLine = ({ text }) => { const handleCopy = () => { navigator.clipboard.writeText(text); setCopied(true); + trackEvent("docs_copy_line", { text }); setTimeout(() => setCopied(false), 2000); }; return (
{ e.currentTarget.style.background = "var(--bg-hover, #f9fafb)"; }} - onMouseOut={(e) => { e.currentTarget.style.background = "transparent"; }} > "{text}"
diff --git a/snippets/prompts-data.jsx b/snippets/prompts-data.jsx index 86d1b37..22b9283 100644 --- a/snippets/prompts-data.jsx +++ b/snippets/prompts-data.jsx @@ -1,7 +1,8 @@ // Auto-generated from skills/_compiled/*.docs.txt +// Regenerate with: bash skills/_compiled/generate.sh then run the generation script export const PROMPTS = { - tracing: `Instrument my code with LangWatch + tracing: `Add LangWatch Tracing to Your Code You are using LangWatch for your AI agent project. Follow these instructions. @@ -56,12 +57,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. -## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. If MCP installation fails, see # Fetching LangWatch Docs Without MCP @@ -138,10 +133,9 @@ Run the application and check that traces appear in your LangWatch dashboard at - Do NOT invent instrumentation patterns — always read the docs for the specific framework - Do NOT skip the \`langwatch.setup()\` call in Python - Do NOT forget to add LANGWATCH_API_KEY to .env -- Do NOT use \`platform_\` MCP tools — this skill is about adding code, not creating platform resources -`, +- Do NOT use \`platform_\` MCP tools — this skill is about adding code, not creating platform resources`, - evaluations: `Set up evaluations for my agent + evaluations: `Set Up Evaluations for Your Agent You are using LangWatch for your AI agent project. Follow these instructions. @@ -316,12 +310,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. 
-## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. If MCP installation fails, see # Fetching LangWatch Docs Without MCP @@ -554,10 +542,9 @@ Go to https://app.langwatch.ai and: - Monitors **measure** (async), guardrails **act** (sync, via code with \`as_guardrail=True\`) — both are online evaluation - Always set up \`LANGWATCH_API_KEY\` in \`.env\` - Always call \`discover_schema\` before creating evaluators via MCP to understand available types -- Do NOT create prompts with \`langwatch prompt create\` CLI when using the platform approach — that's for code-based projects -`, +- Do NOT create prompts with \`langwatch prompt create\` CLI when using the platform approach — that's for code-based projects`, - scenarios: `Add scenario tests for my agent + scenarios: `Test Your Agent with Scenarios You are using LangWatch for your AI agent project. Follow these instructions. @@ -742,12 +729,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. -## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. 
If MCP installation fails, see # Fetching LangWatch Docs Without MCP @@ -791,7 +772,7 @@ scenario.configure(default_model="openai/gpt-5-mini") For TypeScript, create a \`scenario.config.mjs\` file: \`\`\`typescript // scenario.config.mjs -import { defineConfig } from "@langwatch/scenario/config"; +import { defineConfig } from "@langwatch/scenario/integrations/vitest/config"; import { openai } from "@ai-sdk/openai"; export default defineConfig({ @@ -1028,12 +1009,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. -## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. ### Step 2: Understand the Scenario Schema @@ -1097,10 +1072,9 @@ For TypeScript: \`npx vitest run\` - Do NOT use \`fetch_scenario_docs\` for SDK documentation — that's for code-based testing - Write criteria as natural language descriptions, not regex patterns - Create focused scenarios — each should test one specific behavior -- Always call \`discover_schema\` first to understand the scenario format -`, +- Always call \`discover_schema\` first to understand the scenario format`, - prompts: `Version my prompts with LangWatch + prompts: `Version Your Prompts with LangWatch Prompts CLI You are using LangWatch for your AI agent project. Follow these instructions. @@ -1201,12 +1175,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. 
-## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. If MCP installation fails, see # Fetching LangWatch Docs Without MCP @@ -1302,10 +1270,9 @@ Check that your prompts appear on https://app.langwatch.ai in the Prompts sectio - Do NOT hardcode prompts in application code — always use \`langwatch.prompts.get()\` to fetch managed prompts - Do NOT duplicate prompt text as a fallback (no try/catch around \`prompts.get\` with a hardcoded string) — this silently defeats versioning - Do NOT manually edit \`prompts.json\` — use the CLI commands (\`langwatch prompt init\`, \`langwatch prompt create\`, \`langwatch prompt sync\`) -- Do NOT skip \`langwatch prompt sync\` — prompts must be synced to the platform after creation -`, +- Do NOT skip \`langwatch prompt sync\` — prompts must be synced to the platform after creation`, - analytics: `How is my agent performing? + analytics: `Analyze Agent Performance with LangWatch You are using LangWatch for your AI agent project. Follow these instructions. @@ -1346,12 +1313,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. -## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. 
## Step 2: Discover Available Metrics @@ -1408,10 +1369,9 @@ Summarize the data clearly for the user: - Do NOT try to write code -- this skill uses MCP tools only, no SDK installation or code changes - Do NOT hardcode metric names -- discover them dynamically so they stay correct as the platform evolves - Do NOT use \`platform_\` MCP tools for creating resources -- this skill is read-only analytics -- Do NOT present raw JSON to the user -- summarize the data in a clear, human-readable format -`, +- Do NOT present raw JSON to the user -- summarize the data in a clear, human-readable format`, - level_up: `Take my agent to the next level + level_up: `Add LangWatch Tracing to Your Code You are using LangWatch for your AI agent project. Follow these instructions. @@ -1466,12 +1426,6 @@ Or add to \`~/.claude.json\` or \`.mcp.json\` in the project: ## For other editors Add to your editor's MCP settings file using the JSON config above. -## For ChatGPT, Claude Chat, or other web assistants -Use the hosted remote MCP server: -- URL: \`https://mcp.langwatch.ai/sse\` -- Authentication: Bearer Token with your LangWatch API key -- Get a key at https://app.langwatch.ai/authorize - **Tip:** If \`LANGWATCH_API_KEY\` is already in the project's \`.env\` file, use that same key for the MCP configuration. If MCP installation fails, see # Fetching LangWatch Docs Without MCP @@ -2244,7 +2198,7 @@ scenario.configure(default_model="openai/gpt-5-mini") For TypeScript, create a \`scenario.config.mjs\` file: \`\`\`typescript // scenario.config.mjs -import { defineConfig } from "@langwatch/scenario/config"; +import { defineConfig } from "@langwatch/scenario/integrations/vitest/config"; import { openai } from "@ai-sdk/openai"; export default defineConfig({ @@ -2479,78 +2433,11 @@ The MCP must be configured with your LangWatch API key. 
- Do NOT use \`fetch_scenario_docs\` for SDK documentation — that's for code-based testing - Write criteria as natural language descriptions, not regex patterns - Create focused scenarios — each should test one specific behavior -- Always call \`discover_schema\` first to understand the scenario format -`, - - platform_analytics: `You are helping me analyze my AI agent's performance using LangWatch. +- Always call \`discover_schema\` first to understand the scenario format`, -IMPORTANT: You will need my LangWatch API key. Ask me for it and direct me to https://app.langwatch.ai/authorize if I don't have one. + recipe_debug_instrumentation: `Debug Your LangWatch Instrumentation -## Setup - -Install the LangWatch MCP server: - claude mcp add langwatch -- npx -y @langwatch/mcp-server --apiKey - -## What to do - -1. Call discover_schema with category "all" to learn available metrics -2. Call get_analytics to query: - - Total LLM cost (last 7 days) - - P95 latency trends - - Token usage over time - - Error rates -3. Use search_traces to find traces with errors or high latency -4. Present the findings clearly with key numbers and anomalies`, - - platform_scenarios: `You are helping me create scenario tests for my AI agent on the LangWatch platform. - -IMPORTANT: You will need my LangWatch API key. Ask me for it and direct me to https://app.langwatch.ai/authorize if I don't have one. - -## Setup - -Install the LangWatch MCP server: - claude mcp add langwatch -- npx -y @langwatch/mcp-server --apiKey - -## What to do - -1. Call discover_schema with category "scenarios" to understand the format -2. 
Create scenarios using platform_create_scenario for: - - Happy path: normal, expected interactions - - Edge cases: unusual inputs, unclear requests - - Error handling: when things go wrong - -For each scenario, define: - - name: A descriptive name for the test case - - situation: The context and user behavior to simulate - - criteria: What the agent should do (list of success criteria) - - labels: Tags for organization (optional) - -3. Use platform_list_scenarios to review all scenarios -4. Use platform_update_scenario to refine them - -Write criteria as natural language descriptions, not regex patterns. -Each scenario should test one specific behavior.`, - - platform_evaluators: `You are helping me set up evaluators for my AI agent on the LangWatch platform. - -IMPORTANT: You will need my LangWatch API key. Ask me for it and direct me to https://app.langwatch.ai/authorize if I don't have one. - -## Setup - -Install the LangWatch MCP server: - claude mcp add langwatch -- npx -y @langwatch/mcp-server --apiKey - -## What to do - -1. Call discover_schema with category "evaluators" to see available types -2. Use platform_list_evaluators to see existing evaluators -3. Create evaluators using platform_create_evaluator: - - LLM-as-judge evaluators for quality assessment - - Specific evaluator types matching your use case -4. Use platform_get_evaluator and platform_update_evaluator to review and refine -5. Then go to https://app.langwatch.ai to set up monitors using these evaluators`, - - recipe_debug_instrumentation: `You are using LangWatch for your AI agent project. Follow these instructions. +You are using LangWatch for your AI agent project. Follow these instructions. IMPORTANT: You will need a LangWatch API key. Check if LANGWATCH_API_KEY is already in the project's .env file. If not, ask the user for it — they can get one at https://app.langwatch.ai/authorize. 
If they have a LANGWATCH_ENDPOINT in .env, they are on a self-hosted instance — use that endpoint instead of app.langwatch.ai. First, try to install the LangWatch MCP server for access to documentation and platform tools. If installation fails, you can fetch docs directly via the URLs provided below. @@ -2608,10 +2495,11 @@ After fixes, compare before/after: | Spans not connected to traces | Missing \`@langwatch.trace()\` on entry function | Add trace decorator to the main function | | No labels on traces | Labels not set in trace metadata | Add \`metadata={"labels": ["feature"]}\` to trace update | | Missing user_id | User ID not passed to trace | Add \`user_id\` to trace metadata | -| Traces from different calls merged | Missing \`langwatch.setup()\` or trace context not propagated | Ensure \`langwatch.setup()\` called at startup | -`, +| Traces from different calls merged | Missing \`langwatch.setup()\` or trace context not propagated | Ensure \`langwatch.setup()\` called at startup |`, - recipe_improve_setup: `You are using LangWatch for your AI agent project. Follow these instructions. + recipe_improve_setup: `Improve Your LangWatch Setup + +You are using LangWatch for your AI agent project. Follow these instructions. IMPORTANT: You will need a LangWatch API key. Check if LANGWATCH_API_KEY is already in the project's .env file. If not, ask the user for it — they can get one at https://app.langwatch.ai/authorize. If they have a LANGWATCH_ENDPOINT in .env, they are on a self-hosted instance — use that endpoint instead of app.langwatch.ai. First, try to install the LangWatch MCP server for access to documentation and platform tools. If installation fails, you can fetch docs directly via the URLs provided below. 
@@ -2680,10 +2568,11 @@ After each improvement: - Do NOT skip the audit — you can't suggest improvements without understanding the current state - Do NOT give generic advice — every suggestion must be specific to this codebase - Do NOT overwhelm with 10 suggestions — pick the top 2-3 -- Do NOT skip running/verifying improvements -`, +- Do NOT skip running/verifying improvements`, + + recipe_evaluate_multimodal: `Evaluate Your Multimodal Agent - recipe_evaluate_multimodal: `You are using LangWatch for your AI agent project. Follow these instructions. +You are using LangWatch for your AI agent project. Follow these instructions. IMPORTANT: You will need a LangWatch API key. Check if LANGWATCH_API_KEY is already in the project's .env file. If not, ask the user for it — they can get one at https://app.langwatch.ai/authorize. If they have a LANGWATCH_ENDPOINT in .env, they are on a self-hosted instance — use that endpoint instead of app.langwatch.ai. First, try to install the LangWatch MCP server for access to documentation and platform tools. If installation fails, you can fetch docs directly via the URLs provided below. @@ -2773,10 +2662,11 @@ Run the evaluation, review results, fix issues, re-run until quality is acceptab - Do NOT evaluate multimodal agents with text-only metrics — use image-aware judges - Do NOT skip testing with real file formats — synthetic descriptions aren't enough - Do NOT forget to handle file loading errors in evaluations -- Do NOT use generic test images — use domain-specific ones matching the agent's purpose -`, +- Do NOT use generic test images — use domain-specific ones matching the agent's purpose`, - recipe_generate_rag_dataset: `You are using LangWatch for your AI agent project. Follow these instructions. + recipe_generate_rag_dataset: `Generate a RAG Evaluation Dataset + +You are using LangWatch for your AI agent project. Follow these instructions. IMPORTANT: You will need a LangWatch API key. 
Check if LANGWATCH_API_KEY is already in the project's .env file. If not, ask the user for it — they can get one at https://app.langwatch.ai/authorize. If they have a LANGWATCH_ENDPOINT in .env, they are on a self-hosted instance — use that endpoint instead of app.langwatch.ai. First, try to install the LangWatch MCP server for access to documentation and platform tools. If installation fails, you can fetch docs directly via the URLs provided below. @@ -2876,10 +2766,11 @@ Before using the dataset: - Do NOT skip negative cases — testing "I don't know" is crucial for RAG - Do NOT use the same question pattern for every entry — diversify types - Do NOT forget to include the relevant context per row -- Do NOT generate expected outputs that aren't actually in the knowledge base -`, +- Do NOT generate expected outputs that aren't actually in the knowledge base`, - recipe_test_compliance: `You are using LangWatch for your AI agent project. Follow these instructions. + recipe_test_compliance: `Test Your Agent's Compliance Boundaries + +You are using LangWatch for your AI agent project. Follow these instructions. IMPORTANT: You will need a LangWatch API key. Check if LANGWATCH_API_KEY is already in the project's .env file. If not, ask the user for it — they can get one at https://app.langwatch.ai/authorize. If they have a LANGWATCH_ENDPOINT in .env, they are on a self-hosted instance — use that endpoint instead of app.langwatch.ai. First, try to install the LangWatch MCP server for access to documentation and platform tools. If installation fails, you can fetch docs directly via the URLs provided below. 
@@ -3012,10 +2903,11 @@ Create reusable criteria for your domain: - Do NOT only test with polite, straightforward questions — adversarial probing is essential - Do NOT skip multi-turn escalation scenarios — single-turn tests miss persistence attacks - Do NOT use weak criteria like "agent is helpful" — be specific about what it must NOT do -- Do NOT forget to test the "empathetic but firm" response — the agent should show care while maintaining boundaries -`, +- Do NOT forget to test the "empathetic but firm" response — the agent should show care while maintaining boundaries`, - recipe_test_cli_usability: `You are using LangWatch for your AI agent project. Follow these instructions. + recipe_test_cli_usability: `Test Your CLI's Agent Usability + +You are using LangWatch for your AI agent project. Follow these instructions. IMPORTANT: You will need a LangWatch API key. Check if LANGWATCH_API_KEY is already in the project's .env file. If not, ask the user for it — they can get one at https://app.langwatch.ai/authorize. If they have a LANGWATCH_ENDPOINT in .env, they are on a self-hosted instance — use that endpoint instead of app.langwatch.ai. First, try to install the LangWatch MCP server for access to documentation and platform tools. If installation fails, you can fetch docs directly via the URLs provided below. 
@@ -3110,7 +3002,6 @@ Write scenarios where the agent makes a mistake and must recover: - Do NOT output errors without actionable guidance (the agent needs to know how to fix it) - DO make \`--help\` comprehensive on every subcommand - DO use non-zero exit codes for failures (agents check exit codes) -- DO output structured information (the agent can parse it) -`, +- DO output structured information (the agent can parse it)`, -}; \ No newline at end of file +}; diff --git a/snippets/skill-install.jsx b/snippets/skill-install.jsx index c0f9599..0eb5e88 100644 --- a/snippets/skill-install.jsx +++ b/snippets/skill-install.jsx @@ -1,87 +1,69 @@ -import React, { useState } from "react"; +const { useState } = React; + +const trackEvent = (name, props) => { + try { window.posthog?.capture(name, props); } catch {} +}; export const SkillInstall = ({ title, skill, slashCommand, highlighted }) => { - const [copied, setCopied] = useState(false); + const [installCopied, setInstallCopied] = useState(false); + const [cmdCopied, setCmdCopied] = useState(false); const installCmd = `npx skills add ${skill}`; - const handleCopy = () => { + const handleCopyInstall = () => { navigator.clipboard.writeText(installCmd); - setCopied(true); - setTimeout(() => setCopied(false), 2000); + setInstallCopied(true); + trackEvent("docs_copy_skill_install", { title, skill }); + setTimeout(() => setInstallCopied(false), 2000); }; - const CopyIcon = ({ copied }) => copied ? ( - - ) : ( - + const handleCopyCmd = () => { + navigator.clipboard.writeText(slashCommand); + setCmdCopied(true); + trackEvent("docs_copy_slash_command", { title, slashCommand }); + setTimeout(() => setCmdCopied(false), 2000); + }; + + const CopyIcon = ({ size }) => ( + + ); + + const CheckIcon = ({ size }) => ( + ); return ( -
+
+
+ {title} +
+
- {title} + >_ + {installCmd}
-
- >_ - {installCmd} -
- -
- Then use {slashCommand} in your coding agent +
+ Then use{" "} + {slashCommand} + {" "} + + {" "}in your coding agent
); diff --git a/style.css b/style.css index ca77612..70a7fab 100644 --- a/style.css +++ b/style.css @@ -120,3 +120,89 @@ code { max-width: 100%; overflow-x: auto; } + +/* ── LangWatch custom components ── */ + +/* CopyPrompt / CopyLine shared card */ +.lw-copy-prompt { + border: 1px solid #e5e7eb; + border-radius: 12px; + padding: 12px 16px; + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; + cursor: pointer; + transition: all 0.15s; + margin-bottom: 8px; +} +.lw-copy-prompt:hover { + background: #f9fafb; +} + +.lw-copy-btn { + border: 1px solid #e5e7eb; + transition: all 0.15s; +} + +/* Dark mode overrides */ +.dark .lw-copy-prompt { + border-color: #374151; +} +.dark .lw-copy-prompt:hover { + background: transparent; + border-color: #6b7280; +} +.dark .lw-copy-prompt:hover span { + color: #f9fafb; +} + +.dark .lw-copy-btn { + border-color: #374151; +} +.dark .lw-copy-prompt:hover .lw-copy-btn { + border-color: #6b7280; + color: #f9fafb; +} +.dark .lw-copy-btn-copied { + background: rgba(5, 150, 105, 0.15) !important; + border-color: rgba(5, 150, 105, 0.4) !important; +} + +/* SkillInstall card */ +.lw-skill-install { + border: 1px solid #e5e7eb; + border-radius: 12px; + padding: 20px 24px; + margin-bottom: 16px; +} +.lw-skill-install.lw-skill-highlighted { + border-color: rgba(225, 113, 0, 0.4); + background: rgba(225, 113, 0, 0.06); +} + +.dark .lw-skill-install { + border-color: #374151; +} +.dark .lw-skill-install.lw-skill-highlighted { + border-color: rgba(225, 113, 0, 0.4); +} + +/* Inline copy button (next to install cmd and slash command) */ +.lw-inline-copy-btn { + display: inline-flex; + align-items: center; + padding: 2px; + border: none; + background: transparent; + color: #9ca3af; + cursor: pointer; + transition: color 0.15s; + vertical-align: middle; +} +.lw-inline-copy-btn:hover { + color: #6b7280; +} +.dark .lw-inline-copy-btn:hover { + color: #d1d5db; +}