Note: this integration still uses the v0 version of the Firecrawl API. Install version 0.0.20 of the Python SDK (or 0.0.36 of the Node SDK).

Installation

pip install firecrawl-py==0.0.20 llama-index llama-index-readers-web

Usage

Using FireCrawl to Gather an Entire Website

from llama_index.readers.web import FireCrawlWebReader
from llama_index.core import SummaryIndex
from IPython.display import Markdown, display
import os


# Initialize FireCrawlWebReader to crawl an entire website
firecrawl_reader = FireCrawlWebReader(
    api_key="<your_api_key>",  # Replace with your actual API key from https://www.firecrawl.dev/
    mode="crawl",  # "crawl" follows links from the base URL; use "scrape" for a single page
    params={"additional": "parameters"}  # Optional additional parameters
)

# Set your OpenAI API key so the query engine can call the LLM
os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY>"

# Load documents starting from the base URL (the crawl follows links on the site)
documents = firecrawl_reader.load_data(url="http://paulgraham.com/")
index = SummaryIndex.from_documents(documents)

# Query the index
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
display(Markdown(f"<b>{response}</b>"))
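
The reader returns a list of llama_index Document objects, so you can inspect what the crawl picked up before building an index. A minimal sketch, assuming the documents loaded above; the metadata keys (such as "url") depend on what Firecrawl returns and may differ:

# Sketch: inspect the first few crawled documents before indexing.
# The "url" metadata key is an assumption about Firecrawl's output.
for doc in documents[:5]:
    source = doc.metadata.get("url", "<unknown source>")
    print(f"{source}: {len(doc.text)} characters")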

Using FireCrawl to Gather a Single Page

from llama_index.readers.web import FireCrawlWebReader
from llama_index.core import SummaryIndex
from IPython.display import Markdown, display

# Initialize the FireCrawlWebReader with your API key and desired mode
firecrawl_reader = FireCrawlWebReader(
    api_key="<your_api_key>",  # Replace with your actual API key from https://www.firecrawl.dev/
    mode="scrape",  # Choose between "crawl" and "scrape"
    params={"additional": "parameters"}  # Optional additional parameters
)

# Load documents from a specified URL
documents = firecrawl_reader.load_data(url="http://paulgraham.com/worked.html")
index = SummaryIndex.from_documents(documents)

# Query the index
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
display(Markdown(f"<b>{response}</b>"))
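
For larger crawls, a SummaryIndex can be slow because it passes every document to the LLM at query time; a vector index retrieves only the most relevant chunks instead. A minimal sketch, assuming the documents loaded above and the default OpenAI embedding model configured via OPENAI_API_KEY:

from llama_index.core import VectorStoreIndex

# Sketch: build a vector index so queries retrieve only the most relevant
# chunks rather than summarizing every crawled page.
vector_index = VectorStoreIndex.from_documents(documents)
query_engine = vector_index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)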