From e66f0805ea986bb9a1d8d644bd8bb0d9d41ddbe8 Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:58:41 -0400 Subject: [PATCH] Fix YouTube transcript capture --- docs/docs.json | 6 +- docs/integrations/ai/agno.mdx | 151 ++++++++++++++ docs/integrations/data/phidata.mdx | 218 -------------------- docs/snippets/integrations.mdx | 30 +-- tests/py/test_integrations.py | 310 ++++++----------------------- 5 files changed, 222 insertions(+), 493 deletions(-) create mode 100644 docs/integrations/ai/agno.mdx delete mode 100644 docs/integrations/data/phidata.mdx diff --git a/docs/docs.json b/docs/docs.json index f4e0cce..fe7eb4d 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -273,7 +273,6 @@ "integrations/data/pandas_and_pyarrow", "integrations/data/polars_arrow", "integrations/data/dlt", - "integrations/data/phidata", "integrations/data/voxel51" ] }, @@ -281,6 +280,7 @@ "group": "AI Platforms & Frameworks", "pages": [ "integrations/ai/huggingface", + "integrations/ai/agno", "integrations/ai/langchain", "integrations/ai/llamaIndex", "integrations/ai/genkit", @@ -391,6 +391,10 @@ "source": "/integrations/frameworks/:slug*", "destination": "integrations/ai/:slug*" }, + { + "source": "/integrations/data/phidata", + "destination": "integrations/ai/agno" + }, { "source": "/tutorials/rag/:slug*", "destination": "tutorials/agents/:slug*" diff --git a/docs/integrations/ai/agno.mdx b/docs/integrations/ai/agno.mdx new file mode 100644 index 0000000..3537dbc --- /dev/null +++ b/docs/integrations/ai/agno.mdx @@ -0,0 +1,151 @@ +--- +title: "Agno" +sidebarTitle: "Agno" +description: "Build a search assistant using the Agno agent framework with LanceDB as the knowledge backend." 
+--- + +import { + PyFrameworksAgnoAgent, + PyFrameworksAgnoCliChat, + PyFrameworksAgnoIngestYoutube, + PyFrameworksAgnoSetup, +} from '/snippets/integrations.mdx'; + +[Agno](https://docs.agno.com/introduction) is a framework for building agentic AI applications. +It supports LanceDB as a knowledge backend, allowing you to easily ingest and retrieve external content for your agents. + +When you pair Agno's `Knowledge` system with LanceDB, you get a clean Agentic RAG setup. +We'll walk through the steps below to build a YouTube transcript-aware Agno assistant that can: +- Ingest a transcript from a YouTube video via the `youtube-transcript-api` package +- Store embeddings and metadata in LanceDB +- Retrieve context during responses with hybrid search +- Answer questions about the video content in a CLI chat loop + +## Prerequisites + +Install dependencies: + + +```bash pip icon="terminal" +pip install -U agno openai lancedb youtube-transcript-api beautifulsoup4 +``` + +```bash uv icon="terminal" +uv add agno openai lancedb youtube-transcript-api beautifulsoup4 +``` + + +## Step 1: Configure LanceDB-backed knowledge + +First, you can initialize the core `Knowledge` object that your agent will use for retrieval. +It configures LanceDB as the vector store, enables hybrid search with native LanceDB FTS, and sets the embedding model. + + + {PyFrameworksAgnoSetup} + + +## Step 2: Fetch and ingest the YouTube transcript + +Next, extract a YouTube video ID, fetch the full transcript, and flatten it into text for indexing. +The snippet shown below then inserts that transcript text into the Agno knowledge base, which writes vectors and metadata to LanceDB. + + + {PyFrameworksAgnoIngestYoutube} + + + +This path explicitly fetches the transcript first, then inserts transcript text into LanceDB through Agno. + + +## Step 3: Create an Agno agent with knowledge search + +The next step is to construct an Agno `Agent` and attach the knowledge base you just populated. 
+With `search_knowledge=True`, the agent performs retrieval before answering, so responses stay grounded in transcript context. + +In Agno, retrieval is exposed as a tool call that the model can invoke at runtime. +When `search_knowledge=True`, Agno makes a knowledge-search tool (shown in output as `search_knowledge_base(...)`) available to the model; the model decides when to call it, Agno executes the tool, and the returned context is fed back into the final answer. + + + {PyFrameworksAgnoAgent} + + +## Step 4: Start a CLI chat loop + +You can now ask an initial question and then start an interactive loop for follow-up queries. +Each prompt runs through the same retrieval pipeline, so you can iteratively inspect what the transcript contains. + + + {PyFrameworksAgnoCliChat} + + + +Want local-first inference? Replace OpenAI model/embedder classes with Agno's Ollama providers. See Agno's Ollama knowledge examples: [docs.agno.com/examples/models/ollama/chat/knowledge](https://docs.agno.com/examples/models/ollama/chat/knowledge). + + +### Question 1 + +The following question is asked in the CLI chat loop: +``` +┏━ Message ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ +┃ Q: What kinds of data can LanceDB handle? ┃ +┃ ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +┏━ Tool Calls ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ +┃ • search_knowledge_base(query=What kinds of data can LanceDB handle?) 
┃ +┃ • search_knowledge_base(query=LanceDB images audio video handle kinds of data ┃ +┃ can handle 'LanceDB can handle' 'kinds of data' 'images audio video' transcript) ┃ +┃ ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +┏━ Response (19.1s) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ +┃ ┃ +┃ • Images, audio, video — i.e., multimodal AI data and “all manners of things ┃ +┃ you don't put into traditional databases” (per the transcript). ┃ +┃ ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +``` + +We get the response based on the transcript's contents as expected. + +### Question 2 + +Let's ask a more specific question about the CEO of LanceDB, which is also in the transcript: + +``` +You: What is the name of the CEO of LanceDB? +INFO Found 10 documents +┏━ Message ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ +┃ What is the name of the CEO of LanceDB? ┃ +┃ ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +┏━ Tool Calls ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ +┃ • search_knowledge_base(query=CEO of LanceDB) ┃ +┃ ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +┏━ Response (16.7s) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ +┃ ┃ +┃ • According to the retrieved YouTube transcript/title, the CEO of LanceDB is ┃ +┃ Chang She. ┃ +┃ ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +``` + +We get the response based on the transcript's contents and title as expected. + +## Why this works well + +To start, LanceDB OSS can run from a local directory, so transcript data can stay on your machine when you are using the OSS stack. + +- You do not need to maintain a separate transcript parser in your application code. 
+- You do not need to hand-roll chunking and retrieval orchestration across multiple modules. +- One explicit Agno `Knowledge` object, backed by LanceDB, defines both ingestion and search behavior in one place. +- Fewer moving parts mean the tutorial stays readable and the same pattern is easier to carry into production code. + +As your application needs grow, you can migrate to LanceDB [Enterprise](/enterprise) for +convenience features like automatic compaction and reindexing and the ability to scale to +very large datasets. diff --git a/docs/integrations/data/phidata.mdx b/docs/integrations/data/phidata.mdx deleted file mode 100644 index d056602..0000000 --- a/docs/integrations/data/phidata.mdx +++ /dev/null @@ -1,218 +0,0 @@ ---- -title: "Phidata" -sidebarTitle: "PhiData" ---- - -import { - PyPlatformsPhidataCliChat, - PyPlatformsPhidataDocumentModel, - PyPlatformsPhidataLoadKnowledgeBase, - PyPlatformsPhidataOllamaAssistant, - PyPlatformsPhidataOllamaKnowledgeBase, - PyPlatformsPhidataOllamaSetup, - PyPlatformsPhidataOpenaiAssistant, - PyPlatformsPhidataOpenaiKnowledgeBase, - PyPlatformsPhidataOpenaiSetup, - PyPlatformsPhidataTranscriptModule, -} from '/snippets/integrations.mdx'; - -[Phidata](https://docs.phidata.com/introduction) is a framework for building **AI Assistants** with long-term memory, contextual knowledge, and the ability to take actions using function calling. It helps turn general-purpose LLMs into specialized assistants tailored to your use case by extending its capabilities using **memory**, **knowledge**, and **tools**. - -- **Memory**: Stores chat history in a **database** and enables LLMs to have long-term conversations. -- **Knowledge**: Stores information in a **vector database** and provides LLMs with business context. (Here we will use LanceDB) -- **Tools**: Enable LLMs to take actions like pulling data from an **API**, **sending emails** or **querying a database**, etc. 
- -![example](https://raw.githubusercontent.com/lancedb/assets/refs/heads/main/docs/assets/integration/phidata_assistant.png) - -Memory & knowledge make LLMs _smarter_ while tools make them _autonomous_. - -LanceDB is a vector database and its integration into Phidata makes it easy for us to provide a **knowledge base** to LLMs. It enables us to store information as embeddings and search for the **results** similar to ours using **query**. - - -**What is a Knowledge Base?** - -Knowledge Base is a database of information that the Assistant can search to improve its responses. This information is stored in a vector database and provides LLMs with business context, which makes them respond in a context-aware manner. - -While any type of storage can act as a knowledge base, vector databases offer the best solution for retrieving relevant results from dense information quickly. - - -Let's see how using LanceDB inside Phidata helps in making LLM more useful: - -## Prerequisites: install and import necessary dependencies - -**Create a virtual environment** - -1. install virtualenv package - -pip install virtualenv - -2. Create a directory for your project and go to the directory and create a virtual environment inside it. - -mkdir phi - - -cd phi - - -python -m venv phidata_ - - -**Activating virtual environment** - -1. from inside the project directory, run the following command to activate the virtual environment. 
- -phidata_/Scripts/activate - - -**Install the following packages in the virtual environment** - -pip install lancedb phidata youtube_transcript_api openai ollama numpy pandas - - -**Create python files and import necessary libraries** - -You need to create two files -- `transcript.py` and `ollama_assistant.py` or `openai_assistant.py` - - - {PyPlatformsPhidataOpenaiSetup} - - - - {PyPlatformsPhidataOllamaSetup} - - - - {PyPlatformsPhidataTranscriptModule} - - - -If creating Ollama assistant, download and install Ollama [from here](https://ollama.com/) and then run the Ollama instance in the background. Also, download the required models using `ollama pull `. Check out the models [here](https://ollama.com/library) - - -**Run the following command to deactivate the virtual environment if needed** - -deactivate - - -## **Step 1** - Create a Knowledge Base for AI Assistant using LanceDB - - - {PyPlatformsPhidataOpenaiKnowledgeBase} - - - - - {PyPlatformsPhidataOllamaKnowledgeBase} - - -Check out the list of **embedders** supported by **Phidata** and their usage [here](https://docs.phidata.com/embedder/introduction). - -Here we have used `TextKnowledgeBase`, which loads text/docx files to the knowledge base. - -Let's see all the parameters that `TextKnowledgeBase` takes - - -| Name| Type | Purpose | Default | -|:----|:-----|:--------|:--------| -|`path`|`Union[str, Path]`| Path to text file(s). It can point to a single text file or a directory of text files.| provided by user | -|`formats`|`List[str]`| File formats accepted by this knowledge base. |`[".txt"]`| -|`vector_db`|`VectorDb`| Vector Database for the Knowledge Base. Phidata provides a wrapper around many vector DBs, you can import it like this - `from phi.vectordb.lancedb import LanceDb` | provided by user | -|`num_documents`|`int`| Number of results (documents/vectors) that vector search should return. 
|`5`| -|`reader`|`TextReader`| Phidata provides many types of reader objects which read data, clean it and create chunks of data, encapsulate each chunk inside an object of the `Document` class, and return **`List[Document]`**. | `TextReader()` | -|`optimize_on`|`int`| It is used to specify the number of documents on which to optimize the vector database. Supposed to create an index. |`1000`| - -??? Tip "Wonder! What is `Document` class?" - We know that, before storing the data in vectorDB, we need to split the data into smaller chunks upon which embeddings will be created and these embeddings along with the chunks will be stored in vectorDB. When the user queries over the vectorDB, some of these embeddings will be returned as the result based on the semantic similarity with the query. - - When the user queries over vectorDB, the queries are converted into embeddings, and a nearest neighbor search is performed over these query embeddings which returns the embeddings that correspond to most semantically similar chunks(parts of our data) present in vectorDB. - - Here, a "Document" is a class in Phidata. Since there is an option to let Phidata create and manage embeddings, it splits our data into smaller chunks(as expected). It does not directly create embeddings on it. Instead, it takes each chunk and encapsulates it inside the object of the `Document` class along with various other metadata related to the chunk. Then embeddings are created on these `Document` objects and stored in vectorDB. - - - {PyPlatformsPhidataDocumentModel} - - -However, using Phidata you can load many other types of data in the knowledge base(other than text). Check out [Phidata Knowledge Base](https://docs.phidata.com/knowledge/introduction) for more information. 
- -Let's dig deeper into the `vector_db` parameter and see what parameters `LanceDb` takes - - -| Name| Type | Purpose | Default | -|:----|:-----|:--------|:--------| -|`embedder`|`Embedder`| Phidata provides many Embedders that abstract the interaction with embedding APIs and utilize it to generate embeddings. Check out other embedders [here](https://docs.phidata.com/embedder/introduction) | `OpenAIEmbedder` | -|`distance`|`List[str]`| The choice of distance metric used to calculate the similarity between vectors, which directly impacts search results and performance in vector databases. |`Distance.cosine`| -|`connection`|`lancedb.db.LanceTable`| LanceTable can be accessed through `.connection`. You can connect to an existing table of LanceDB, created outside of Phidata, and utilize it. If not provided, it creates a new table using `table_name` parameter and adds it to `connection`. |`None`| -|`uri`|`str`| It specifies the directory location of **LanceDB database** and establishes a connection that can be used to interact with the database. | `"/tmp/lancedb"` | -|`table_name`|`str`| If `connection` is not provided, it initializes and connects to a new **LanceDB table** with a specified(or default) name in the database present at `uri`. |`"phi"`| -|`nprobes`|`int`| It refers to the number of partitions that the search algorithm examines to find the nearest neighbors of a given query vector. Higher values will yield better recall (more likely to find vectors if they exist) at the expense of latency. |`20`| - - - -Since we just initialized the KnowledgeBase. The VectorDB table that corresponds to this Knowledge Base is not yet populated with our data. It will be populated in **Step 3**, once we perform the `load` operation. - -You can check the state of the LanceDB table using - `knowledge_base.vector_db.connection.to_pandas()` - - -Now that the Knowledge Base is initialized, , we can go to **step 2**. 
- -## **Step 2** - Create an assistant with our choice of LLM and reference to the knowledge base. - - - {PyPlatformsPhidataOpenaiAssistant} - - - - {PyPlatformsPhidataOllamaAssistant} - - -Assistants add **memory**, **knowledge**, and **tools** to LLMs. Here we will add only **knowledge** in this example. - -Whenever we will give a query to LLM, the assistant will retrieve relevant information from our **Knowledge Base**(table in LanceDB) and pass it to LLM along with the user query in a structured way. - -- The `add_references_to_prompt=True` always adds information from the knowledge base to the prompt, regardless of whether it is relevant to the question. - -To know more about an creating assistant in Phidata, check out [Phidata docs](https://docs.phidata.com/assistants/introduction) here. - -## **Step 3** - Load data to Knowledge Base. - - - {PyPlatformsPhidataLoadKnowledgeBase} - -The above code loads the data to the Knowledge Base(LanceDB Table) and now it is ready to be used by the assistant. - -| Name| Type | Purpose | Default | -|:----|:-----|:--------|:--------| -|`recreate`|`bool`| If True, it drops the existing table and recreates the table in the vectorDB. |`False`| -|`upsert`|`bool`| If True and the vectorDB supports upsert, it will upsert documents to the vector db. | `False` | -|`skip_existing`|`bool`| If True, skips documents that already exist in the vectorDB when inserting. |`True`| - -> **Tip · What is upsert?** -> Upsert is a database operation that combines “update” and “insert”. It updates existing records if a document with the same identifier exists, or inserts new records if no matching record exists. This keeps the knowledge base current without manual checks. - -During the Load operation, Phidata directly interacts with the LanceDB library and performs the loading of the table with our data in the following steps - - -1. **Creates** and **initializes** the table if it does not exist. - -2. 
Then it **splits** our data into smaller **chunks**. - - > **Question · How do they create chunks?** - > **Phidata** provides multiple knowledge-base types depending on the source data. Most of them (except the LlamaIndexKnowledgeBase and LangChainKnowledgeBase) expose a `document_lists` iterator. During the load operation, this iterator reads the input (for example, text files), splits it into chunks, wraps each chunk in a `Document`, and yields lists of those `Document` objects. - -3. Then **embeddings** are created on these chunks are **inserted** into the LanceDB Table - - > **Question · How do they insert the chunks into LanceDB?** - > Each list of `Document` objects from the previous step is processed as follows: - > - > - Generate embeddings for every `Document`. - > - Clean the `content` field so only the text you care about is persisted. - > - Prepare a payload with the `id`, the embedding (`vector`), and any metadata needed for retrieval. - > - Add the prepared rows to the LanceDB table. - -4. Now the internal state of `knowledge_base` is changed (embeddings are created and loaded in the table ) and it **ready to be used by assistant**. - -## **Step 4** - Start a cli chatbot with access to the Knowledge base - - - {PyPlatformsPhidataCliChat} - - - -For more information and amazing cookbooks of Phidata, read the [Phidata documentation](https://docs.phidata.com/introduction) and also visit [LanceDB x Phidata docmentation](https://docs.phidata.com/vectordb/lancedb). 
\ No newline at end of file diff --git a/docs/snippets/integrations.mdx b/docs/snippets/integrations.mdx index e956e45..e25ba26 100644 --- a/docs/snippets/integrations.mdx +++ b/docs/snippets/integrations.mdx @@ -44,6 +44,14 @@ export const PyEmbeddingVoyageaiMultimodal = "import tempfile\nfrom pathlib impo export const PyEmbeddingVoyageaiUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nfrom lancedb.embeddings import EmbeddingFunctionRegistry\nfrom lancedb.pydantic import LanceModel, Vector\n\nvoyageai = (\n EmbeddingFunctionRegistry.get_instance().get(\"voyageai\").create(name=\"voyage-3\")\n)\n\nclass TextModel(LanceModel):\n text: str = voyageai.SourceField()\n vector: Vector(voyageai.ndims()) = voyageai.VectorField()\n\ndata = [{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}]\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"voyageai-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\n\ntbl.add(data)\n"; +export const PyFrameworksAgnoAgent = "agent = Agent(\n model=OpenAIResponses(id=\"gpt-5-mini\"),\n knowledge=knowledge,\n search_knowledge=True,\n instructions=\"Search the transcript and answer only from retrieved context.\",\n markdown=True,\n)\n"; + +export const PyFrameworksAgnoCliChat = "agent.print_response(\n \"Summarize the loaded video transcript in 5 concise bullet points.\",\n stream=True,\n)\nwhile True:\n question = input(\"You: \").strip()\n if question.lower() in {\"exit\", \"quit\", \"bye\"}:\n break\n agent.print_response(question, stream=True)\n"; + +export const PyFrameworksAgnoIngestYoutube = "youtube_url = \"https://www.youtube.com/watch?v=wl6mFyXoxos\"\nvideo_id = extract_video_id(youtube_url)\nytt = YouTubeTranscriptApi()\ntranscript_segments = ytt.fetch(video_id, languages=[\"en\", \"en-US\"]).to_raw_data()\ntranscript_text = \" \".join(segment[\"text\"] for segment in transcript_segments)\n\nknowledge.insert(\n name=f\"YouTube Transcript ({video_id})\",\n 
text_content=transcript_text,\n metadata={\"source\": \"youtube\", \"video_id\": video_id, \"video_url\": youtube_url},\n)\n"; + +export const PyFrameworksAgnoSetup = "import os\nimport re\n\nfrom agno.agent import Agent\nfrom agno.knowledge.embedder.openai import OpenAIEmbedder\nfrom agno.knowledge.knowledge import Knowledge\nfrom agno.models.openai import OpenAIResponses\nfrom agno.vectordb.lancedb import LanceDb, SearchType\nfrom youtube_transcript_api import YouTubeTranscriptApi\n\nif \"OPENAI_API_KEY\" not in os.environ:\n os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n\ndef extract_video_id(youtube_url: str) -> str:\n match = re.search(r\"(?<=v=)[\\w-]+\", youtube_url) or re.search(\n r\"(?<=be/)[\\w-]+\", youtube_url\n )\n if not match:\n raise ValueError(\"Could not parse YouTube video ID from URL\")\n return match.group(0)\n\nknowledge = Knowledge(\n vector_db=LanceDb(\n uri=\"./tmp/lancedb\",\n table_name=\"youtube_transcripts\",\n search_type=SearchType.hybrid,\n use_tantivy=False,\n embedder=OpenAIEmbedder(id=\"text-embedding-3-small\"),\n ),\n)\n"; + export const PyFrameworksLangchainAddImages = "image_uris = [\"./assets/image-1.png\", \"./assets/image-2.png\"]\nvector_store.add_images(uris=image_uris)\n# here image_uris are local fs paths to the images.\n"; export const PyFrameworksLangchainAddTexts = "vector_store.add_texts(texts=[\"test_123\"], metadatas=[{\"source\": \"wiki\"}])\n\n# Additionaly, to explore the table you can load it into a df or save it in a csv file:\n\ntbl = vector_store.get_table()\nprint(\"tbl:\", tbl)\npd_df = tbl.to_pandas()\npd_df.to_csv(\"docsearch.csv\", index=False)\n\n# you can also create a new vector store object using an older connection object:\nvector_store = LanceDB(connection=tbl, embedding=embeddings)\n"; @@ -104,26 +112,6 @@ export const PyPlatformsPandasImports = "import asyncio\nimport tempfile\nfrom p export const PyPlatformsPandasVectorSearch = "pandas_results = (\n pandas_table.search([0.9, 0.1, 0.3])\n 
.select([\"text\", \"_distance\"])\n .limit(1)\n .to_pandas()\n)\nprint(pandas_results)\n"; -export const PyPlatformsPhidataCliChat = "assistant.print_response(\"Ask me about something from the knowledge base\")\nwhile True:\n message = Prompt.ask(f\"[bold] :sunglasses: User [/bold]\")\n if message in (\"exit\", \"bye\"):\n break\n assistant.print_response(message, markdown=True)\n"; - -export const PyPlatformsPhidataDocumentModel = "from typing import Any, Dict, List, Optional\n\nfrom pydantic import BaseModel\n\nclass Document(BaseModel):\n \"\"\"Model for managing a document\"\"\"\n\n content: str # <--- here data of chunk is stored\n id: Optional[str] = None\n name: Optional[str] = None\n meta_data: Dict[str, Any] = {}\n embedder: Optional[\"Embedder\"] = None\n embedding: Optional[List[float]] = None\n usage: Optional[Dict[str, Any]] = None\n"; - -export const PyPlatformsPhidataLoadKnowledgeBase = "assistant.knowledge_base.load(recreate=False)\n"; - -export const PyPlatformsPhidataOllamaAssistant = "# define an assistant with llama3.1 llm and reference to the knowledge base created above\nassistant = Assistant(\n llm=Ollama(model=\"llama3.1\"),\n description=\"\"\"You are an Expert in explaining youtube video transcripts. You are a bot that takes transcript of a video and answer the question based on it.\n\n This is transcript for the above timestamp: {relevant_document}\n The user input is: {user_input}\n generate highlights only when asked.\n When asked to generate highlights from the video, understand the context for each timestamp and create key highlight points, answer in following way -\n [timestamp] - highlight 1\n [timestamp] - highlight 2\n ... so on\n\n Your task is to understand the user question, and provide an answer using the provided contexts. Your answers are correct, high-quality, and written by an domain expert. 
If the provided context does not contain the answer, simply state,'The provided context does not have the answer.'\"\"\",\n knowledge_base=knowledge_base,\n add_references_to_prompt=True,\n)\n"; - -export const PyPlatformsPhidataOllamaKnowledgeBase = "# Create knowledge Base with OllamaEmbedder in LanceDB\nknowledge_base = TextKnowledgeBase(\n path=\"transcript.txt\",\n vector_db=LanceDb(\n embedder=OllamaEmbedder(model=\"nomic-embed-text\", dimensions=768),\n table_name=\"transcript_documents\",\n uri=\"./t2mp/.lancedb\",\n ),\n num_documents=10,\n)\n"; - -export const PyPlatformsPhidataOllamaSetup = "from phi.assistant import Assistant\nfrom phi.embedder.ollama import OllamaEmbedder\nfrom phi.knowledge.text import TextKnowledgeBase\nfrom phi.llm.ollama import Ollama\nfrom phi.vectordb.lancedb import LanceDb\nfrom rich.prompt import Prompt\nfrom transcript import extract_transcript\n\n# The code below creates a file \"transcript.txt\" in the directory, the txt file will be used below\nyoutube_url = \"https://www.youtube.com/watch?v=Xs33-Gzl8Mo\"\nsegment_duration = 20\ntranscript_text, dict_transcript = extract_transcript(youtube_url, segment_duration)\n"; - -export const PyPlatformsPhidataOpenaiAssistant = "# define an assistant with gpt-4o-mini llm and reference to the knowledge base created above\nassistant = Assistant(\n llm=OpenAIChat(\n model=\"gpt-4o-mini\",\n max_tokens=1000,\n temperature=0.3,\n api_key=openai.api_key,\n ),\n description=\"\"\"You are an Expert in explaining youtube video transcripts. You are a bot that takes transcript of a video and answer the question based on it.\n\n This is transcript for the above timestamp: {relevant_document}\n The user input is: {user_input}\n generate highlights only when asked.\n When asked to generate highlights from the video, understand the context for each timestamp and create key highlight points, answer in following way -\n [timestamp] - highlight 1\n [timestamp] - highlight 2\n ... 
so on\n\n Your task is to understand the user question, and provide an answer using the provided contexts. Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state,'The provided context does not have the answer.'\"\"\",\n knowledge_base=knowledge_base,\n add_references_to_prompt=True,\n)\n"; - -export const PyPlatformsPhidataOpenaiKnowledgeBase = "# Create knowledge Base with OpenAIEmbedder in LanceDB\nknowledge_base = TextKnowledgeBase(\n path=\"transcript.txt\",\n vector_db=LanceDb(\n embedder=OpenAIEmbedder(api_key=openai.api_key),\n table_name=\"transcript_documents\",\n uri=\"./t3mp/.lancedb\",\n ),\n num_documents=10,\n)\n"; - -export const PyPlatformsPhidataOpenaiSetup = "import os\n\nimport openai\nfrom phi.assistant import Assistant\nfrom phi.embedder.openai import OpenAIEmbedder\nfrom phi.knowledge.text import TextKnowledgeBase\nfrom phi.llm.openai import OpenAIChat\nfrom phi.vectordb.lancedb import LanceDb\nfrom rich.prompt import Prompt\nfrom transcript import extract_transcript\n\nif \"OPENAI_API_KEY\" not in os.environ:\n # OR set the key here as a variable\n openai.api_key = \"sk-...\"\n\n# The code below creates a file \"transcript.txt\" in the directory, the txt file will be used below\nyoutube_url = \"https://www.youtube.com/watch?v=Xs33-Gzl8Mo\"\nsegment_duration = 20\ntranscript_text, dict_transcript = extract_transcript(youtube_url, segment_duration)\n"; - -export const PyPlatformsPhidataTranscriptModule = "import re\n\nfrom youtube_transcript_api import YouTubeTranscriptApi\n\ndef smodify(seconds):\n hours, remainder = divmod(seconds, 3600)\n minutes, seconds = divmod(remainder, 60)\n return f\"{int(hours):02}:{int(minutes):02}:{int(seconds):02}\"\n\ndef extract_transcript(youtube_url, segment_duration):\n # Extract video ID from the URL\n video_id = re.search(r\"(?<=v=)[\\w-]+\", youtube_url)\n if not video_id:\n video_id = re.search(r\"(?<=be/)[\\w-]+\", 
youtube_url)\n if not video_id:\n return None\n\n video_id = video_id.group(0)\n\n # Attempt to fetch the transcript\n try:\n # Try to get the official transcript\n transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[\"en\"])\n except Exception:\n # If no official transcript is found, try to get auto-generated transcript\n try:\n transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n for transcript in transcript_list:\n transcript = transcript.translate(\"en\").fetch()\n except Exception:\n return None\n\n # Format the transcript into 120s chunks\n transcript_text, dict_transcript = format_transcript(\n transcript, segment_duration\n )\n # Open the file in write mode, which creates it if it doesn't exist\n with open(\"transcript.txt\", \"w\", encoding=\"utf-8\") as file:\n file.write(transcript_text)\n return transcript_text, dict_transcript\n\ndef format_transcript(transcript, segment_duration):\n chunked_transcript = []\n chunk_dict = []\n current_chunk = []\n current_time = 0\n # 2 minutes in seconds\n start_time_chunk = 0 # To track the start time of the current chunk\n\n for segment in transcript:\n start_time = segment[\"start\"]\n end_time_x = start_time + segment[\"duration\"]\n text = segment[\"text\"]\n\n # Add text to the current chunk\n current_chunk.append(text)\n\n # Update the current time with the duration of the current segment\n # The duration of the current segment is given by segment['start'] - start_time_chunk\n if current_chunk:\n current_time = start_time - start_time_chunk\n\n # If current chunk duration reaches or exceeds 2 minutes, save the chunk\n if current_time >= segment_duration:\n # Use the start time of the first segment in the current chunk as the timestamp\n chunked_transcript.append(\n f\"[{smodify(start_time_chunk)} to {smodify(end_time_x)}] \"\n + \" \".join(current_chunk)\n )\n current_chunk = re.sub(\n r\"[\\xa0\\n]\",\n lambda x: \"\" if x.group() == \"\\xa0\" else \" \",\n 
\"\\n\".join(current_chunk),\n )\n chunk_dict.append(\n {\n \"timestamp\": f\"[{smodify(start_time_chunk)} to {smodify(end_time_x)}]\",\n \"text\": \"\".join(current_chunk),\n }\n )\n current_chunk = [] # Reset the chunk\n start_time_chunk = (\n start_time + segment[\"duration\"]\n ) # Update the start time for the next chunk\n current_time = 0 # Reset current time\n\n # Add any remaining text in the last chunk\n if current_chunk:\n chunked_transcript.append(\n f\"[{smodify(start_time_chunk)} to {smodify(end_time_x)}] \"\n + \" \".join(current_chunk)\n )\n current_chunk = re.sub(\n r\"[\\xa0\\n]\",\n lambda x: \"\" if x.group() == \"\\xa0\" else \" \",\n \"\\n\".join(current_chunk),\n )\n chunk_dict.append(\n {\n \"timestamp\": f\"[{smodify(start_time_chunk)} to {smodify(end_time_x)}]\",\n \"text\": \"\".join(current_chunk),\n }\n )\n\n return \"\\n\\n\".join(chunked_transcript), chunk_dict\n"; - export const PyPlatformsPolarsCreateTable = "birds = pl.DataFrame(\n {\n \"text\": [\"phoenix\", \"sparrow\"],\n \"vector\": [\n [0.1, 0.2, 0.3],\n [0.8, 0.6, 0.5],\n ],\n }\n)\npolars_db = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"polars-demo\"))\npolars_table = polars_db.create_table(\n \"birds\", data=birds.to_arrow(), mode=\"overwrite\"\n)\n"; export const PyPlatformsPolarsImports = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport polars as pl\nfrom lancedb.pydantic import LanceModel, Vector\n"; @@ -160,8 +148,6 @@ export const PyRerankingJinaUsage = "import os\n\nimport lancedb\nfrom lancedb.e export const PyRerankingLinearCombinationUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import LinearCombinationReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata 
= [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = LinearCombinationReranker()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n"; -export const PyRerankingMrrMultivector = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n meta: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n meta_vector: Vector(embedder.ndims()) = embedder.VectorField(source_column=\"meta\")\n\ndata = [\n {\"text\": \"hello world\", \"meta\": \"greeting message\"},\n {\"text\": \"goodbye world\", \"meta\": \"farewell message\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\n\n# Search across multiple vector columns and collect results with row IDs\nquery = \"hello\"\nrs1 = tbl.search(query, vector_column_name=\"vector\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(query, vector_column_name=\"meta_vector\").limit(10).with_row_id(True).to_arrow()\n\n# Rerank the combined results using MRR\nreranker = MRRReranker()\ncombined_results = reranker.rerank_multivector([rs1, rs2])\n"; - export const PyRerankingMrrUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = 
embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = MRRReranker(weight_vector=0.7, weight_fts=0.3)\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n\n# Run multivector search across multiple vector columns\nrs1 = tbl.search(\"hello\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(\"greeting\").limit(10).with_row_id(True).to_arrow()\ncombined = MRRReranker().rerank_multivector([rs1, rs2])\n"; export const PyRerankingOpenaiUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import OpenaiReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = OpenaiReranker()\n\n# Run vector search with a reranker\nresult = tbl.search(\"hello\").rerank(reranker=reranker).to_list()\n\n# Run FTS search with a reranker\nresult = tbl.search(\"hello\", query_type=\"fts\").rerank(reranker=reranker).to_list()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n"; diff --git a/tests/py/test_integrations.py b/tests/py/test_integrations.py index 43f408e..f923a76 100644 --- a/tests/py/test_integrations.py +++ b/tests/py/test_integrations.py @@ -1399,273 +1399,79 @@ def test_platforms_duckdb_examples() -> None: # --8<-- 
[end:platforms_duckdb_mean_price] -def test_platforms_phidata_transcript_module() -> None: - require_flag("RUN_PHIDATA_SNIPPETS") +def test_frameworks_agno_openai_examples() -> None: + require_flag("RUN_AGNO_SNIPPETS") + pytest.importorskip("agno") pytest.importorskip("youtube_transcript_api") - # --8<-- [start:platforms_phidata_transcript_module] + # --8<-- [start:frameworks_agno_setup] + import os import re + from agno.agent import Agent + from agno.knowledge.embedder.openai import OpenAIEmbedder + from agno.knowledge.knowledge import Knowledge + from agno.models.openai import OpenAIResponses + from agno.vectordb.lancedb import LanceDb, SearchType from youtube_transcript_api import YouTubeTranscriptApi - def smodify(seconds): - hours, remainder = divmod(seconds, 3600) - minutes, seconds = divmod(remainder, 60) - return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}" - - def extract_transcript(youtube_url, segment_duration): - # Extract video ID from the URL - video_id = re.search(r"(?<=v=)[\w-]+", youtube_url) - if not video_id: - video_id = re.search(r"(?<=be/)[\w-]+", youtube_url) - if not video_id: - return None - - video_id = video_id.group(0) - - # Attempt to fetch the transcript - try: - # Try to get the official transcript - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"]) - except Exception: - # If no official transcript is found, try to get auto-generated transcript - try: - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) - for transcript in transcript_list: - transcript = transcript.translate("en").fetch() - except Exception: - return None - - # Format the transcript into 120s chunks - transcript_text, dict_transcript = format_transcript( - transcript, segment_duration - ) - # Open the file in write mode, which creates it if it doesn't exist - with open("transcript.txt", "w", encoding="utf-8") as file: - file.write(transcript_text) - return transcript_text, dict_transcript - - def 
format_transcript(transcript, segment_duration): - chunked_transcript = [] - chunk_dict = [] - current_chunk = [] - current_time = 0 - # 2 minutes in seconds - start_time_chunk = 0 # To track the start time of the current chunk - - for segment in transcript: - start_time = segment["start"] - end_time_x = start_time + segment["duration"] - text = segment["text"] - - # Add text to the current chunk - current_chunk.append(text) - - # Update the current time with the duration of the current segment - # The duration of the current segment is given by segment['start'] - start_time_chunk - if current_chunk: - current_time = start_time - start_time_chunk - - # If current chunk duration reaches or exceeds 2 minutes, save the chunk - if current_time >= segment_duration: - # Use the start time of the first segment in the current chunk as the timestamp - chunked_transcript.append( - f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}] " - + " ".join(current_chunk) - ) - current_chunk = re.sub( - r"[\xa0\n]", - lambda x: "" if x.group() == "\xa0" else " ", - "\n".join(current_chunk), - ) - chunk_dict.append( - { - "timestamp": f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}]", - "text": "".join(current_chunk), - } - ) - current_chunk = [] # Reset the chunk - start_time_chunk = ( - start_time + segment["duration"] - ) # Update the start time for the next chunk - current_time = 0 # Reset current time - - # Add any remaining text in the last chunk - if current_chunk: - chunked_transcript.append( - f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}] " - + " ".join(current_chunk) - ) - current_chunk = re.sub( - r"[\xa0\n]", - lambda x: "" if x.group() == "\xa0" else " ", - "\n".join(current_chunk), - ) - chunk_dict.append( - { - "timestamp": f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}]", - "text": "".join(current_chunk), - } - ) - - return "\n\n".join(chunked_transcript), chunk_dict - - # --8<-- [end:platforms_phidata_transcript_module] - - -def 
test_platforms_phidata_openai_examples() -> None: - require_flag("RUN_PHIDATA_SNIPPETS") - pytest.importorskip("phi") - pytest.importorskip("openai") - pytest.importorskip("rich") - - # --8<-- [start:platforms_phidata_openai_setup] - import os + if "OPENAI_API_KEY" not in os.environ: + os.environ["OPENAI_API_KEY"] = "sk-..." - import openai - from phi.assistant import Assistant - from phi.embedder.openai import OpenAIEmbedder - from phi.knowledge.text import TextKnowledgeBase - from phi.llm.openai import OpenAIChat - from phi.vectordb.lancedb import LanceDb - from rich.prompt import Prompt - from transcript import extract_transcript + def extract_video_id(youtube_url: str) -> str: + match = re.search(r"(?<=v=)[\w-]+", youtube_url) or re.search( + r"(?<=be/)[\w-]+", youtube_url + ) + if not match: + raise ValueError("Could not parse YouTube video ID from URL") + return match.group(0) - if "OPENAI_API_KEY" not in os.environ: - # OR set the key here as a variable - openai.api_key = "sk-..." 
- - # The code below creates a file "transcript.txt" in the directory, the txt file will be used below - youtube_url = "https://www.youtube.com/watch?v=Xs33-Gzl8Mo" - segment_duration = 20 - transcript_text, dict_transcript = extract_transcript(youtube_url, segment_duration) - # --8<-- [end:platforms_phidata_openai_setup] - - # --8<-- [start:platforms_phidata_openai_knowledge_base] - # Create knowledge Base with OpenAIEmbedder in LanceDB - knowledge_base = TextKnowledgeBase( - path="transcript.txt", + knowledge = Knowledge( vector_db=LanceDb( - embedder=OpenAIEmbedder(api_key=openai.api_key), - table_name="transcript_documents", - uri="./t3mp/.lancedb", + uri="./tmp/lancedb", + table_name="youtube_transcripts", + search_type=SearchType.hybrid, + use_tantivy=False, + embedder=OpenAIEmbedder(id="text-embedding-3-small"), ), - num_documents=10, ) - # --8<-- [end:platforms_phidata_openai_knowledge_base] - - # --8<-- [start:platforms_phidata_openai_assistant] - # define an assistant with gpt-4o-mini llm and reference to the knowledge base created above - assistant = Assistant( - llm=OpenAIChat( - model="gpt-4o-mini", - max_tokens=1000, - temperature=0.3, - api_key=openai.api_key, - ), - description="""You are an Expert in explaining youtube video transcripts. You are a bot that takes transcript of a video and answer the question based on it. - - This is transcript for the above timestamp: {relevant_document} - The user input is: {user_input} - generate highlights only when asked. - When asked to generate highlights from the video, understand the context for each timestamp and create key highlight points, answer in following way - - [timestamp] - highlight 1 - [timestamp] - highlight 2 - ... so on - - Your task is to understand the user question, and provide an answer using the provided contexts. Your answers are correct, high-quality, and written by an domain expert. 
If the provided context does not contain the answer, simply state,'The provided context does not have the answer.'""", - knowledge_base=knowledge_base, - add_references_to_prompt=True, + # --8<-- [end:frameworks_agno_setup] + + # --8<-- [start:frameworks_agno_ingest_youtube] + youtube_url = "https://www.youtube.com/watch?v=wl6mFyXoxos" + video_id = extract_video_id(youtube_url) + ytt = YouTubeTranscriptApi() + transcript_segments = ytt.fetch(video_id, languages=["en", "en-US"]).to_raw_data() + transcript_text = " ".join(segment["text"] for segment in transcript_segments) + + knowledge.insert( + name=f"YouTube Transcript ({video_id})", + text_content=transcript_text, + metadata={"source": "youtube", "video_id": video_id, "video_url": youtube_url}, ) - # --8<-- [end:platforms_phidata_openai_assistant] - - # --8<-- [start:platforms_phidata_load_knowledge_base] - assistant.knowledge_base.load(recreate=False) - # --8<-- [end:platforms_phidata_load_knowledge_base] + # --8<-- [end:frameworks_agno_ingest_youtube] + + # --8<-- [start:frameworks_agno_agent] + agent = Agent( + model=OpenAIResponses(id="gpt-5-mini"), + knowledge=knowledge, + search_knowledge=True, + instructions="Search the transcript and answer only from retrieved context.", + markdown=True, + ) + # --8<-- [end:frameworks_agno_agent] - # --8<-- [start:platforms_phidata_cli_chat] - assistant.print_response("Ask me about something from the knowledge base") + # --8<-- [start:frameworks_agno_cli_chat] + agent.print_response( + "Summarize the loaded video transcript in 5 concise bullet points.", + stream=True, + ) while True: - message = Prompt.ask(f"[bold] :sunglasses: User [/bold]") - if message in ("exit", "bye"): + question = input("You: ").strip() + if question.lower() in {"exit", "quit", "bye"}: break - assistant.print_response(message, markdown=True) - # --8<-- [end:platforms_phidata_cli_chat] - - -def test_platforms_phidata_ollama_examples() -> None: - require_flag("RUN_PHIDATA_SNIPPETS") - 
pytest.importorskip("phi") - - # --8<-- [start:platforms_phidata_ollama_setup] - from phi.assistant import Assistant - from phi.embedder.ollama import OllamaEmbedder - from phi.knowledge.text import TextKnowledgeBase - from phi.llm.ollama import Ollama - from phi.vectordb.lancedb import LanceDb - from rich.prompt import Prompt - from transcript import extract_transcript - - # The code below creates a file "transcript.txt" in the directory, the txt file will be used below - youtube_url = "https://www.youtube.com/watch?v=Xs33-Gzl8Mo" - segment_duration = 20 - transcript_text, dict_transcript = extract_transcript(youtube_url, segment_duration) - # --8<-- [end:platforms_phidata_ollama_setup] - - # --8<-- [start:platforms_phidata_ollama_knowledge_base] - # Create knowledge Base with OllamaEmbedder in LanceDB - knowledge_base = TextKnowledgeBase( - path="transcript.txt", - vector_db=LanceDb( - embedder=OllamaEmbedder(model="nomic-embed-text", dimensions=768), - table_name="transcript_documents", - uri="./t2mp/.lancedb", - ), - num_documents=10, - ) - # --8<-- [end:platforms_phidata_ollama_knowledge_base] - - # --8<-- [start:platforms_phidata_ollama_assistant] - # define an assistant with llama3.1 llm and reference to the knowledge base created above - assistant = Assistant( - llm=Ollama(model="llama3.1"), - description="""You are an Expert in explaining youtube video transcripts. You are a bot that takes transcript of a video and answer the question based on it. - - This is transcript for the above timestamp: {relevant_document} - The user input is: {user_input} - generate highlights only when asked. - When asked to generate highlights from the video, understand the context for each timestamp and create key highlight points, answer in following way - - [timestamp] - highlight 1 - [timestamp] - highlight 2 - ... so on - - Your task is to understand the user question, and provide an answer using the provided contexts. 
Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state,'The provided context does not have the answer.'""", - knowledge_base=knowledge_base, - add_references_to_prompt=True, - ) - # --8<-- [end:platforms_phidata_ollama_assistant] - - -def test_platforms_phidata_document_model() -> None: - require_flag("RUN_PHIDATA_SNIPPETS") - - # --8<-- [start:platforms_phidata_document_model] - from typing import Any, Dict, List, Optional - - from pydantic import BaseModel - - class Document(BaseModel): - """Model for managing a document""" - - content: str # <--- here data of chunk is stored - id: Optional[str] = None - name: Optional[str] = None - meta_data: Dict[str, Any] = {} - embedder: Optional["Embedder"] = None - embedding: Optional[List[float]] = None - usage: Optional[Dict[str, Any]] = None - - # --8<-- [end:platforms_phidata_document_model] + agent.print_response(question, stream=True) + # --8<-- [end:frameworks_agno_cli_chat] def test_platforms_voxel51_examples() -> None: