Extraer

Extrae datos estructurados de páginas web con LLMs

# Submit an extraction request: POST the JSON options below to the
# Firecrawl v2 /extract endpoint, authenticating with a Bearer token.
curl -X POST 'https://api.firecrawl.dev/v2/extract' \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "urls": [
    "<string>"
  ],
  "enableWebSearch": false,
  "ignoreInvalidURLs": true,
  "ignoreSitemap": false,
  "includeSubdomains": true,
  "prompt": "<string>",
  "schema": {},
  "scrapeOptions": {
    "actions": [
      {
        "milliseconds": 2,
        "type": "wait"
      }
    ],
    "blockAds": true,
    "excludeTags": [
      "<string>"
    ],
    "formats": [
      "markdown"
    ],
    "headers": {},
    "includeTags": [
      "<string>"
    ],
    "location": {
      "country": "US",
      "languages": [
        "en-US"
      ]
    },
    "lockdown": false,
    "maxAge": 172800000,
    "minAge": 123,
    "mobile": false,
    "onlyCleanContent": false,
    "onlyMainContent": true,
    "parsers": [
      "pdf"
    ],
    "proxy": "auto",
    "redactPII": false,
    "removeBase64Images": true,
    "skipTlsVerification": true,
    "storeInCache": true,
    "threatProtection": {
      "blacklist": [
        "<string>"
      ],
      "blockedTlds": [
        "<string>"
      ],
      "riskScoreThreshold": 75,
      "whitelist": [
        "<string>"
      ]
    },
    "timeout": 60000,
    "waitFor": 0
  },
  "showSources": false,
  "threatProtection": {
    "blacklist": [
      "<string>"
    ],
    "blockedTlds": [
      "<string>"
    ],
    "riskScoreThreshold": 75,
    "whitelist": [
      "<string>"
    ]
  }
}
'

import requests

# Firecrawl v2 extract endpoint.
url = "https://api.firecrawl.dev/v2/extract"

# JSON body of the extract request. "<string>" values are placeholders to be
# replaced with real URLs, a prompt, tags, and so on.
payload = {
    "urls": ["<string>"],
    "enableWebSearch": False,
    "ignoreInvalidURLs": True,
    "ignoreSitemap": False,
    "includeSubdomains": True,
    "prompt": "<string>",
    "schema": {},
    "scrapeOptions": {
        "actions": [
            {
                "milliseconds": 2,
                "type": "wait"
            }
        ],
        "blockAds": True,
        "excludeTags": ["<string>"],
        "formats": ["markdown"],
        "headers": {},
        "includeTags": ["<string>"],
        "location": {
            "country": "US",
            "languages": ["en-US"]
        },
        "lockdown": False,
        "maxAge": 172800000,
        "minAge": 123,
        "mobile": False,
        "onlyCleanContent": False,
        "onlyMainContent": True,
        "parsers": ["pdf"],
        "proxy": "auto",
        "redactPII": False,
        "removeBase64Images": True,
        "skipTlsVerification": True,
        "storeInCache": True,
        "threatProtection": {
            "blacklist": ["<string>"],
            "blockedTlds": ["<string>"],
            "riskScoreThreshold": 75,
            "whitelist": ["<string>"]
        },
        "timeout": 60000,
        "waitFor": 0
    },
    "showSources": False,
    "threatProtection": {
        "blacklist": ["<string>"],
        "blockedTlds": ["<string>"],
        "riskScoreThreshold": 75,
        "whitelist": ["<string>"]
    }
}

# Bearer-token authentication; `json=` below serializes the payload as JSON.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout by default, so without one this call could block
# forever if the server never answers. Give up after 30 seconds instead.
response = requests.post(url, json=payload, headers=headers, timeout=30)

# Print the raw response body.
print(response.text)

// Body of the extract request; it is serialized to JSON before being sent.
// '<string>' values are placeholders.
const requestBody = {
  urls: ['<string>'],
  enableWebSearch: false,
  ignoreInvalidURLs: true,
  ignoreSitemap: false,
  includeSubdomains: true,
  prompt: '<string>',
  schema: {},
  scrapeOptions: {
    actions: [
      {
        milliseconds: 2,
        type: 'wait'
      }
    ],
    blockAds: true,
    excludeTags: ['<string>'],
    formats: ['markdown'],
    headers: {},
    includeTags: ['<string>'],
    location: {
      country: 'US',
      languages: ['en-US']
    },
    lockdown: false,
    maxAge: 172800000,
    minAge: 123,
    mobile: false,
    onlyCleanContent: false,
    onlyMainContent: true,
    parsers: ['pdf'],
    proxy: 'auto',
    redactPII: false,
    removeBase64Images: true,
    skipTlsVerification: true,
    storeInCache: true,
    threatProtection: {
      blacklist: ['<string>'],
      blockedTlds: ['<string>'],
      riskScoreThreshold: 75,
      whitelist: ['<string>']
    },
    timeout: 60000,
    waitFor: 0
  },
  showSources: false,
  threatProtection: {
    blacklist: ['<string>'],
    blockedTlds: ['<string>'],
    riskScoreThreshold: 75,
    whitelist: ['<string>']
  }
};

// fetch() settings: POST with Bearer authentication and a JSON body.
const options = {
  method: 'POST',
  headers: {
    Authorization: 'Bearer <token>',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify(requestBody)
};

// Send the request and log the parsed JSON response. Any failure while
// sending, parsing or logging is reported through console.error.
(async () => {
  try {
    const res = await fetch('https://api.firecrawl.dev/v2/extract', options);
    console.log(await res.json());
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Send an extract request to the Firecrawl v2 API with the cURL extension
// and print either the response body or the cURL error message.
$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.firecrawl.dev/v2/extract",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'urls' => [
      '<string>'
    ],
    'enableWebSearch' => false,
    'ignoreInvalidURLs' => true,
    'ignoreSitemap' => false,
    'includeSubdomains' => true,
    'prompt' => '<string>',
    // json_encode() serializes an empty PHP array as the JSON array `[]`.
    // `schema` is a JSON object, so use stdClass to emit `{}` instead.
    'schema' => new stdClass(),
    'scrapeOptions' => [
      'actions' => [
        [
          'milliseconds' => 2,
          'type' => 'wait'
        ]
      ],
      'blockAds' => true,
      'excludeTags' => [
        '<string>'
      ],
      'formats' => [
        'markdown'
      ],
      // Same as `schema`: an empty header map must be sent as `{}`, not `[]`.
      'headers' => new stdClass(),
      'includeTags' => [
        '<string>'
      ],
      'location' => [
        'country' => 'US',
        'languages' => [
          'en-US'
        ]
      ],
      'lockdown' => false,
      'maxAge' => 172800000,
      'minAge' => 123,
      'mobile' => false,
      'onlyCleanContent' => false,
      'onlyMainContent' => true,
      'parsers' => [
        'pdf'
      ],
      'proxy' => 'auto',
      'redactPII' => false,
      'removeBase64Images' => true,
      'skipTlsVerification' => true,
      'storeInCache' => true,
      'threatProtection' => [
        'blacklist' => [
          '<string>'
        ],
        'blockedTlds' => [
          '<string>'
        ],
        'riskScoreThreshold' => 75,
        'whitelist' => [
          '<string>'
        ]
      ],
      'timeout' => 60000,
      'waitFor' => 0
    ],
    'showSources' => false,
    'threatProtection' => [
      'blacklist' => [
        '<string>'
      ],
      'blockedTlds' => [
        '<string>'
      ],
      'riskScoreThreshold' => 75,
      'whitelist' => [
        '<string>'
      ]
    ]
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns the response body.
$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

// curl_error() returns an empty string when no error occurred.
if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
)

// requestBody is the JSON payload sent to the extract endpoint.
// "<string>" values are placeholders.
const requestBody = `{
  "urls": [
    "<string>"
  ],
  "enableWebSearch": false,
  "ignoreInvalidURLs": true,
  "ignoreSitemap": false,
  "includeSubdomains": true,
  "prompt": "<string>",
  "schema": {},
  "scrapeOptions": {
    "actions": [
      {
        "milliseconds": 2,
        "type": "wait"
      }
    ],
    "blockAds": true,
    "excludeTags": [
      "<string>"
    ],
    "formats": [
      "markdown"
    ],
    "headers": {},
    "includeTags": [
      "<string>"
    ],
    "location": {
      "country": "US",
      "languages": [
        "en-US"
      ]
    },
    "lockdown": false,
    "maxAge": 172800000,
    "minAge": 123,
    "mobile": false,
    "onlyCleanContent": false,
    "onlyMainContent": true,
    "parsers": [
      "pdf"
    ],
    "proxy": "auto",
    "redactPII": false,
    "removeBase64Images": true,
    "skipTlsVerification": true,
    "storeInCache": true,
    "threatProtection": {
      "blacklist": [
        "<string>"
      ],
      "blockedTlds": [
        "<string>"
      ],
      "riskScoreThreshold": 75,
      "whitelist": [
        "<string>"
      ]
    },
    "timeout": 60000,
    "waitFor": 0
  },
  "showSources": false,
  "threatProtection": {
    "blacklist": [
      "<string>"
    ],
    "blockedTlds": [
      "<string>"
    ],
    "riskScoreThreshold": 75,
    "whitelist": [
      "<string>"
    ]
  }
}`

// main runs the extract request and exits with status 1 if it fails.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, "extract request failed:", err)
		os.Exit(1)
	}
}

// run POSTs requestBody to the Firecrawl v2 extract endpoint and prints the
// response body to stdout. It returns an error if the request cannot be
// built or sent, or if the response body cannot be read.
func run() error {
	url := "https://api.firecrawl.dev/v2/extract"

	payload := strings.NewReader(requestBody)

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		return fmt.Errorf("build request: %w", err)
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be touched here.
		return fmt.Errorf("send request: %w", err)
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("read response body: %w", err)
	}

	fmt.Println(string(body))
	return nil
}

// JSON payload for the extract request; "<string>" values are placeholders.
String requestBody = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": true,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"milliseconds\": 2,\n        \"type\": \"wait\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"formats\": [\n      \"markdown\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"lockdown\": false,\n    \"maxAge\": 172800000,\n    \"minAge\": 123,\n    \"mobile\": false,\n    \"onlyCleanContent\": false,\n    \"onlyMainContent\": true,\n    \"parsers\": [\n      \"pdf\"\n    ],\n    \"proxy\": \"auto\",\n    \"redactPII\": false,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": true,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 60000,\n    \"waitFor\": 0\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}";

// POST the payload to the extract endpoint with Bearer authentication and
// receive the response body as a String.
HttpResponse<String> response = Unirest.post("https://api.firecrawl.dev/v2/extract")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(requestBody)
  .asString();

require 'uri'
require 'net/http'

# Firecrawl v2 extract endpoint.
url = URI("https://api.firecrawl.dev/v2/extract")

# Build the POST request with its authentication and content-type headers.
request = Net::HTTP::Post.new(url, {
  "Authorization" => 'Bearer <token>',
  "Content-Type" => 'application/json'
})

# JSON payload for the extract request; "<string>" values are placeholders.
request.body = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": true,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"milliseconds\": 2,\n        \"type\": \"wait\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"formats\": [\n      \"markdown\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"lockdown\": false,\n    \"maxAge\": 172800000,\n    \"minAge\": 123,\n    \"mobile\": false,\n    \"onlyCleanContent\": false,\n    \"onlyMainContent\": true,\n    \"parsers\": [\n      \"pdf\"\n    ],\n    \"proxy\": \"auto\",\n    \"redactPII\": false,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": true,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 60000,\n    \"waitFor\": 0\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}"

# Open a TLS connection for the duration of the block and send the request.
response = Net::HTTP.start(url.host, url.port, use_ssl: true) do |http|
  http.request(request)
end

# Print the response body.
puts response.read_body

{
  "id": "<string>",
  "invalidURLs": [
    "<string>"
  ],
  "success": true
}

{
  "error": "Invalid input data."
}

{
  "error": "An unexpected error occurred on the server."
}

POST

extract

Extrae datos estructurados de páginas web con LLMs

# Submit an extraction request: POST the JSON options below to the
# Firecrawl v2 /extract endpoint, authenticating with a Bearer token.
curl -X POST 'https://api.firecrawl.dev/v2/extract' \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "urls": [
    "<string>"
  ],
  "enableWebSearch": false,
  "ignoreInvalidURLs": true,
  "ignoreSitemap": false,
  "includeSubdomains": true,
  "prompt": "<string>",
  "schema": {},
  "scrapeOptions": {
    "actions": [
      {
        "milliseconds": 2,
        "type": "wait"
      }
    ],
    "blockAds": true,
    "excludeTags": [
      "<string>"
    ],
    "formats": [
      "markdown"
    ],
    "headers": {},
    "includeTags": [
      "<string>"
    ],
    "location": {
      "country": "US",
      "languages": [
        "en-US"
      ]
    },
    "lockdown": false,
    "maxAge": 172800000,
    "minAge": 123,
    "mobile": false,
    "onlyCleanContent": false,
    "onlyMainContent": true,
    "parsers": [
      "pdf"
    ],
    "proxy": "auto",
    "redactPII": false,
    "removeBase64Images": true,
    "skipTlsVerification": true,
    "storeInCache": true,
    "threatProtection": {
      "blacklist": [
        "<string>"
      ],
      "blockedTlds": [
        "<string>"
      ],
      "riskScoreThreshold": 75,
      "whitelist": [
        "<string>"
      ]
    },
    "timeout": 60000,
    "waitFor": 0
  },
  "showSources": false,
  "threatProtection": {
    "blacklist": [
      "<string>"
    ],
    "blockedTlds": [
      "<string>"
    ],
    "riskScoreThreshold": 75,
    "whitelist": [
      "<string>"
    ]
  }
}
'

import requests

# Firecrawl v2 extract endpoint.
url = "https://api.firecrawl.dev/v2/extract"

# JSON body of the extract request. "<string>" values are placeholders to be
# replaced with real URLs, a prompt, tags, and so on.
payload = {
    "urls": ["<string>"],
    "enableWebSearch": False,
    "ignoreInvalidURLs": True,
    "ignoreSitemap": False,
    "includeSubdomains": True,
    "prompt": "<string>",
    "schema": {},
    "scrapeOptions": {
        "actions": [
            {
                "milliseconds": 2,
                "type": "wait"
            }
        ],
        "blockAds": True,
        "excludeTags": ["<string>"],
        "formats": ["markdown"],
        "headers": {},
        "includeTags": ["<string>"],
        "location": {
            "country": "US",
            "languages": ["en-US"]
        },
        "lockdown": False,
        "maxAge": 172800000,
        "minAge": 123,
        "mobile": False,
        "onlyCleanContent": False,
        "onlyMainContent": True,
        "parsers": ["pdf"],
        "proxy": "auto",
        "redactPII": False,
        "removeBase64Images": True,
        "skipTlsVerification": True,
        "storeInCache": True,
        "threatProtection": {
            "blacklist": ["<string>"],
            "blockedTlds": ["<string>"],
            "riskScoreThreshold": 75,
            "whitelist": ["<string>"]
        },
        "timeout": 60000,
        "waitFor": 0
    },
    "showSources": False,
    "threatProtection": {
        "blacklist": ["<string>"],
        "blockedTlds": ["<string>"],
        "riskScoreThreshold": 75,
        "whitelist": ["<string>"]
    }
}

# Bearer-token authentication; `json=` below serializes the payload as JSON.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout by default, so without one this call could block
# forever if the server never answers. Give up after 30 seconds instead.
response = requests.post(url, json=payload, headers=headers, timeout=30)

# Print the raw response body.
print(response.text)

// Body of the extract request; it is serialized to JSON before being sent.
// '<string>' values are placeholders.
const requestBody = {
  urls: ['<string>'],
  enableWebSearch: false,
  ignoreInvalidURLs: true,
  ignoreSitemap: false,
  includeSubdomains: true,
  prompt: '<string>',
  schema: {},
  scrapeOptions: {
    actions: [
      {
        milliseconds: 2,
        type: 'wait'
      }
    ],
    blockAds: true,
    excludeTags: ['<string>'],
    formats: ['markdown'],
    headers: {},
    includeTags: ['<string>'],
    location: {
      country: 'US',
      languages: ['en-US']
    },
    lockdown: false,
    maxAge: 172800000,
    minAge: 123,
    mobile: false,
    onlyCleanContent: false,
    onlyMainContent: true,
    parsers: ['pdf'],
    proxy: 'auto',
    redactPII: false,
    removeBase64Images: true,
    skipTlsVerification: true,
    storeInCache: true,
    threatProtection: {
      blacklist: ['<string>'],
      blockedTlds: ['<string>'],
      riskScoreThreshold: 75,
      whitelist: ['<string>']
    },
    timeout: 60000,
    waitFor: 0
  },
  showSources: false,
  threatProtection: {
    blacklist: ['<string>'],
    blockedTlds: ['<string>'],
    riskScoreThreshold: 75,
    whitelist: ['<string>']
  }
};

// fetch() settings: POST with Bearer authentication and a JSON body.
const options = {
  method: 'POST',
  headers: {
    Authorization: 'Bearer <token>',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify(requestBody)
};

// Send the request and log the parsed JSON response. Any failure while
// sending, parsing or logging is reported through console.error.
(async () => {
  try {
    const res = await fetch('https://api.firecrawl.dev/v2/extract', options);
    console.log(await res.json());
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Send an extract request to the Firecrawl v2 API with the cURL extension
// and print either the response body or the cURL error message.
$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.firecrawl.dev/v2/extract",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'urls' => [
      '<string>'
    ],
    'enableWebSearch' => false,
    'ignoreInvalidURLs' => true,
    'ignoreSitemap' => false,
    'includeSubdomains' => true,
    'prompt' => '<string>',
    // json_encode() serializes an empty PHP array as the JSON array `[]`.
    // `schema` is a JSON object, so use stdClass to emit `{}` instead.
    'schema' => new stdClass(),
    'scrapeOptions' => [
      'actions' => [
        [
          'milliseconds' => 2,
          'type' => 'wait'
        ]
      ],
      'blockAds' => true,
      'excludeTags' => [
        '<string>'
      ],
      'formats' => [
        'markdown'
      ],
      // Same as `schema`: an empty header map must be sent as `{}`, not `[]`.
      'headers' => new stdClass(),
      'includeTags' => [
        '<string>'
      ],
      'location' => [
        'country' => 'US',
        'languages' => [
          'en-US'
        ]
      ],
      'lockdown' => false,
      'maxAge' => 172800000,
      'minAge' => 123,
      'mobile' => false,
      'onlyCleanContent' => false,
      'onlyMainContent' => true,
      'parsers' => [
        'pdf'
      ],
      'proxy' => 'auto',
      'redactPII' => false,
      'removeBase64Images' => true,
      'skipTlsVerification' => true,
      'storeInCache' => true,
      'threatProtection' => [
        'blacklist' => [
          '<string>'
        ],
        'blockedTlds' => [
          '<string>'
        ],
        'riskScoreThreshold' => 75,
        'whitelist' => [
          '<string>'
        ]
      ],
      'timeout' => 60000,
      'waitFor' => 0
    ],
    'showSources' => false,
    'threatProtection' => [
      'blacklist' => [
        '<string>'
      ],
      'blockedTlds' => [
        '<string>'
      ],
      'riskScoreThreshold' => 75,
      'whitelist' => [
        '<string>'
      ]
    ]
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns the response body.
$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

// curl_error() returns an empty string when no error occurred.
if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
)

// requestBody is the JSON payload sent to the extract endpoint.
// "<string>" values are placeholders.
const requestBody = `{
  "urls": [
    "<string>"
  ],
  "enableWebSearch": false,
  "ignoreInvalidURLs": true,
  "ignoreSitemap": false,
  "includeSubdomains": true,
  "prompt": "<string>",
  "schema": {},
  "scrapeOptions": {
    "actions": [
      {
        "milliseconds": 2,
        "type": "wait"
      }
    ],
    "blockAds": true,
    "excludeTags": [
      "<string>"
    ],
    "formats": [
      "markdown"
    ],
    "headers": {},
    "includeTags": [
      "<string>"
    ],
    "location": {
      "country": "US",
      "languages": [
        "en-US"
      ]
    },
    "lockdown": false,
    "maxAge": 172800000,
    "minAge": 123,
    "mobile": false,
    "onlyCleanContent": false,
    "onlyMainContent": true,
    "parsers": [
      "pdf"
    ],
    "proxy": "auto",
    "redactPII": false,
    "removeBase64Images": true,
    "skipTlsVerification": true,
    "storeInCache": true,
    "threatProtection": {
      "blacklist": [
        "<string>"
      ],
      "blockedTlds": [
        "<string>"
      ],
      "riskScoreThreshold": 75,
      "whitelist": [
        "<string>"
      ]
    },
    "timeout": 60000,
    "waitFor": 0
  },
  "showSources": false,
  "threatProtection": {
    "blacklist": [
      "<string>"
    ],
    "blockedTlds": [
      "<string>"
    ],
    "riskScoreThreshold": 75,
    "whitelist": [
      "<string>"
    ]
  }
}`

// main runs the extract request and exits with status 1 if it fails.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, "extract request failed:", err)
		os.Exit(1)
	}
}

// run POSTs requestBody to the Firecrawl v2 extract endpoint and prints the
// response body to stdout. It returns an error if the request cannot be
// built or sent, or if the response body cannot be read.
func run() error {
	url := "https://api.firecrawl.dev/v2/extract"

	payload := strings.NewReader(requestBody)

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		return fmt.Errorf("build request: %w", err)
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be touched here.
		return fmt.Errorf("send request: %w", err)
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("read response body: %w", err)
	}

	fmt.Println(string(body))
	return nil
}

// JSON payload for the extract request; "<string>" values are placeholders.
String requestBody = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": true,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"milliseconds\": 2,\n        \"type\": \"wait\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"formats\": [\n      \"markdown\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"lockdown\": false,\n    \"maxAge\": 172800000,\n    \"minAge\": 123,\n    \"mobile\": false,\n    \"onlyCleanContent\": false,\n    \"onlyMainContent\": true,\n    \"parsers\": [\n      \"pdf\"\n    ],\n    \"proxy\": \"auto\",\n    \"redactPII\": false,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": true,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 60000,\n    \"waitFor\": 0\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}";

// POST the payload to the extract endpoint with Bearer authentication and
// receive the response body as a String.
HttpResponse<String> response = Unirest.post("https://api.firecrawl.dev/v2/extract")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(requestBody)
  .asString();

require 'uri'
require 'net/http'

# Firecrawl v2 extract endpoint.
url = URI("https://api.firecrawl.dev/v2/extract")

# Build the POST request with its authentication and content-type headers.
request = Net::HTTP::Post.new(url, {
  "Authorization" => 'Bearer <token>',
  "Content-Type" => 'application/json'
})

# JSON payload for the extract request; "<string>" values are placeholders.
request.body = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": true,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"milliseconds\": 2,\n        \"type\": \"wait\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"formats\": [\n      \"markdown\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"lockdown\": false,\n    \"maxAge\": 172800000,\n    \"minAge\": 123,\n    \"mobile\": false,\n    \"onlyCleanContent\": false,\n    \"onlyMainContent\": true,\n    \"parsers\": [\n      \"pdf\"\n    ],\n    \"proxy\": \"auto\",\n    \"redactPII\": false,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": true,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 60000,\n    \"waitFor\": 0\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}"

# Open a TLS connection for the duration of the block and send the request.
response = Net::HTTP.start(url.host, url.port, use_ssl: true) do |http|
  http.request(request)
end

# Print the response body.
puts response.read_body

{
  "id": "<string>",
  "invalidURLs": [
    "<string>"
  ],
  "success": true
}

{
  "error": "Invalid input data."
}

{
  "error": "An unexpected error occurred on the server."
}

¿Eres un agente de IA que necesita una clave de API de Firecrawl? Consulta firecrawl.dev/agent-onboarding/SKILL.md para ver las instrucciones de incorporación automática.

Autorizaciones

Authorization

string

header

requerido

Encabezado de autenticación Bearer con el formato Bearer <token>, donde <token> es tu token de autenticación.

Cuerpo

application/json

urls

string<uri>[]

requerido

Las URL de las que se extraerán datos. Deben estar en formato glob.

enableWebSearch

boolean

predeterminado:false

Si es true, la extracción utilizará la búsqueda web para encontrar datos adicionales.

ignoreInvalidURLs

boolean

predeterminado:true

Si se especifican URLs no válidas en el array urls, se ignorarán. En lugar de provocar el fallo de toda la solicitud, se realizará una extracción con las URLs válidas restantes, y las URLs no válidas se devolverán en el campo invalidURLs de la respuesta.

ignoreSitemap

boolean

predeterminado:false

Si es true, se ignorarán los archivos sitemap.xml durante el escaneo del sitio web.

includeSubdomains

boolean

predeterminado:true

Si se establece en true, también se escanearán los subdominios de las URL proporcionadas.

prompt

string

Prompt para orientar el proceso de extracción.

schema

object

Esquema que define la estructura de los datos extraídos. Debe cumplir con JSON Schema.

scrapeOptions

object

Show child attributes

showSources

boolean

predeterminado:false

Cuando se establece en true, las fuentes utilizadas para extraer los datos se incluirán en la respuesta bajo la clave sources.

threatProtection

Threat Protection Override · object

Anulación por solicitud de Protección contra amenazas. Los campos que proporciones reemplazan los campos correspondientes de la política de tu organización solo para esta solicitud; los campos omitidos conservan sus valores a nivel de organización. Requiere que Protección contra amenazas esté habilitada para tu equipo (función enterprise); de lo contrario, la solicitud se rechaza con un 403. Si tu organización ha deshabilitado las anulaciones por solicitud, cualquier solicitud que incluya este objeto se rechaza con un 403. Si Protección contra amenazas se aplica de forma obligatoria a tu equipo, mode no puede establecerse en off.

Show child attributes

Respuesta

Extracción exitosa

id

string

invalidURLs

string[] | null

Si ignoreInvalidURLs es true, este será un array que contendrá las URL no válidas que se especificaron en la solicitud. Si no hubo URL no válidas, será un array vacío. Si ignoreInvalidURLs es false, este campo será undefined.

success

boolean

Uso de la API

Endpoints de búsqueda

Endpoints de scraping

Endpoints de interacción

Endpoints de Research Index

Endpoints de mapeo

Endpoints de procesamiento

Endpoints de rastreo

Endpoints de supervisión

Endpoints de feedback

Endpoints de depuración con agentes

Endpoints de la cuenta

Cargas útiles de webhook

Integración de socios

Autorizaciones

Cuerpo

Respuesta