Commit 9181642: Korvus x Firecrawl

File tree: 5 files changed, +261 −0 lines

.env.development

+4
@@ -0,0 +1,4 @@
CRAWL_URL="postgresml.org"
CRAWL_LIMIT=100
KORVUS_DATABASE_URL=""
FIRECRAWL_API_KEY=""

.gitignore

+2
@@ -0,0 +1,2 @@
.env
venv

LICENSE

+20
@@ -0,0 +1,20 @@
Copyright (c) 2024 PostgresML Team

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md

+68
@@ -0,0 +1,68 @@
# Korvus x Firecrawl Example

This example application demonstrates how to perform web crawling, semantic search, and Retrieval-Augmented Generation (RAG) using [Korvus](https://github.com/postgresml/korvus) and [Firecrawl](https://www.firecrawl.dev/).

## Features

- Web crawling using Firecrawl
- Semantic search over crawled content
- RAG (Retrieval-Augmented Generation) for question answering

## Prerequisites

- Python 3.7+
- Firecrawl API key
- PostgresML database URL

## Installation

1. Clone this repository:
```
git clone https://github.com/yourusername/korvus-firecrawl-example.git
cd korvus-firecrawl-example
```

2. Install the required packages:
```
pip install korvus firecrawl python-dotenv rich
```

3. Create a `.env` file in the project root and add your credentials:
```
FIRECRAWL_API_KEY=your_firecrawl_api_key
KORVUS_DATABASE_URL=your_postgresml_database_url
CRAWL_URL=https://example.com
CRAWL_LIMIT=100
```

## Usage

The application supports three main actions: crawl, search, and rag.

1. Crawl a website:
```
python main.py crawl
```

2. Perform semantic search:
```
python main.py search
```

3. Use RAG for question answering:
```
python main.py rag
```

For search and RAG, you'll be prompted to enter queries; type 'q' to quit the input loop. A sample session is sketched below.
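
The session below is hypothetical: the query is illustrative and the result payload is elided, but the prompt and the `Results:` header are what `main.py` actually prints:

```
$ python main.py search
Enter your query (or 'q' to quit): What is Korvus?

Results:

[pretty-printed list of matching documents]
Enter your query (or 'q' to quit): q
```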

## How it works

1. The application uses Firecrawl to crawl the specified website and extract markdown content.
2. Crawled data is processed and stored using Korvus.
3. Semantic search allows you to find relevant documents based on your queries (sketched in the snippet below).
4. RAG combines retrieved context with a language model to answer questions.
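
For orientation, here is a minimal sketch of the storage and search steps in isolation. It reuses the `Pipeline` and `Collection` configuration from `main.py`; the upserted document and the query are illustrative placeholders, not real Firecrawl output:

```
import asyncio
import os

from korvus import Collection, Pipeline

# Same Pipeline and Collection configuration as main.py
pipeline = Pipeline(
    "v0",
    {
        "markdown": {
            "splitter": {"model": "markdown"},
            "semantic_search": {"model": "mixedbread-ai/mxbai-embed-large-v1"},
        },
    },
)
collection = Collection(
    "korvus-firecrawl-example-0", database_url=os.environ["KORVUS_DATABASE_URL"]
)


async def example():
    # Registering the pipeline is idempotent; repeat calls are no-ops
    await collection.add_pipeline(pipeline)

    # Each crawled page becomes one document keyed by its source URL
    # (placeholder content here instead of real crawl output)
    await collection.upsert_documents(
        [{"id": "https://example.com/page", "markdown": "# A page\n\nSome content."}]
    )

    # Semantic search embeds the query and returns the best-matching documents
    results = await collection.search(
        {
            "query": {"semantic_search": {"markdown": {"query": "What is on the page?"}}},
            "limit": 5,
        },
        pipeline,
    )
    print(results)


asyncio.run(example())
```

`main.py` runs the same sequence, but feeds in the pages returned by Firecrawl instead of a placeholder document.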

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

main.py

+167
@@ -0,0 +1,167 @@
from korvus import Collection, Pipeline
from firecrawl import FirecrawlApp
import os
import asyncio
from rich import print
from rich.pretty import pprint
from dotenv import load_dotenv
import argparse


# Load variables from our .env file
load_dotenv()


# Configure our program args
parser = argparse.ArgumentParser(description="Example Korvus x Firecrawl")
parser.add_argument(
    "action", choices=["crawl", "search", "rag"], help="Action to perform"
)


# Initialize the FirecrawlApp with your API key
firecrawl = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])


# Define our Pipeline and Collection
pipeline = Pipeline(
    "v0",
    {
        "markdown": {
            "splitter": {"model": "markdown"},
            "semantic_search": {
                "model": "mixedbread-ai/mxbai-embed-large-v1",
            },
        },
    },
)
collection = Collection(
    "korvus-firecrawl-example-0", database_url=os.environ["KORVUS_DATABASE_URL"]
)


# Add our Pipeline to our Collection
async def add_pipeline():
    await collection.add_pipeline(pipeline)


# Crawl with Firecrawl
def crawl():
    print("Crawling...")
    # Block until the crawl job completes, polling every 30 seconds
    job = firecrawl.crawl_url(
        os.environ["CRAWL_URL"],
        params={
            "limit": int(os.environ["CRAWL_LIMIT"]),
            "scrapeOptions": {"formats": ["markdown"]},
        },
        poll_interval=30,
    )
    return job


# Do RAG
async def do_rag(user_query):
    results = await collection.rag(
        {
            "CONTEXT": {
                "vector_search": {
                    "query": {
                        "fields": {
                            "markdown": {
                                "query": user_query,
                                "parameters": {
                                    "prompt": "Represent this sentence for searching relevant passages: "
                                },
                            }
                        },
                    },
                    "document": {"keys": ["id"]},
                    "rerank": {
                        "model": "mixedbread-ai/mxbai-rerank-base-v1",
                        "query": user_query,
                        "num_documents_to_rerank": 100,
                    },
                    "limit": 5,
                },
                "aggregate": {"join": "\n\n\n"},
            },
            "chat": {
                "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a question answering bot. Answer the user's question given the context succinctly.",
                    },
                    {
                        "role": "user",
                        "content": f"Given the context:\n\n{{CONTEXT}}\n\nAnswer the question: {user_query}",
                    },
                ],
                "max_tokens": 256,
            },
        },
        pipeline,
    )
    return results


# Do search
async def do_search(user_query):
    results = await collection.search(
        {
            "query": {
                "semantic_search": {
                    "markdown": {
                        "query": user_query,
                    },
                },
            },
            "limit": 5,
        },
        pipeline,
    )
    return results


# Get user input and call our callback
async def input_loop(callback):
    while True:
        query = input("Enter your query (or 'q' to quit): ")
        if query.lower() == "q":
            break
        results = await callback(query)
        print("\n[bold]Results:[/bold]\n")
        pprint(results, max_length=2, max_string=100)


# Our main function
async def main():
    args = parser.parse_args()

    if args.action == "crawl":
        # Add our Pipeline to our Collection
        # We only ever need to do this once
        # Calling it more than once does nothing
        await add_pipeline()

        # Crawl the website
        results = crawl()

        # Construct our documents to upsert
        documents = [
            {"id": data["metadata"]["sourceURL"], "markdown": data["markdown"]}
            for data in results["data"]
        ]

        # Upsert our documents
        await collection.upsert_documents(documents)

    elif args.action == "rag":
        await input_loop(do_rag)

    elif args.action == "search":
        await input_loop(do_search)


asyncio.run(main())
