Extract

Extrae datos estructurados de páginas web usando LLMs

# POST a JSON extract request to the Firecrawl v1 API.
# <token> and every "<string>" value below are placeholders.
curl -X POST \
  'https://api.firecrawl.dev/v1/extract' \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "urls": [
    "<string>"
  ],
  "enableWebSearch": false,
  "ignoreInvalidURLs": false,
  "ignoreSitemap": false,
  "includeSubdomains": true,
  "prompt": "<string>",
  "schema": {},
  "scrapeOptions": {
    "actions": [
      {
        "type": "wait",
        "milliseconds": 2,
        "selector": "#my-element"
      }
    ],
    "blockAds": true,
    "excludeTags": [
      "<string>"
    ],
    "headers": {},
    "includeTags": [
      "<string>"
    ],
    "jsonOptions": {
      "prompt": "<string>",
      "schema": {},
      "systemPrompt": "<string>"
    },
    "location": {
      "country": "US",
      "languages": [
        "en-US"
      ]
    },
    "maxAge": 0,
    "mobile": false,
    "onlyMainContent": true,
    "parsePDF": true,
    "removeBase64Images": true,
    "skipTlsVerification": false,
    "storeInCache": true,
    "threatProtection": {
      "blacklist": [
        "<string>"
      ],
      "blockedTlds": [
        "<string>"
      ],
      "riskScoreThreshold": 75,
      "whitelist": [
        "<string>"
      ]
    },
    "timeout": 30000,
    "waitFor": 0,
    "changeTrackingOptions": {
      "modes": [],
      "prompt": "<string>",
      "schema": {},
      "tag": null
    },
    "formats": [
      "markdown"
    ]
  },
  "showSources": false,
  "threatProtection": {
    "blacklist": [
      "<string>"
    ],
    "blockedTlds": [
      "<string>"
    ],
    "riskScoreThreshold": 75,
    "whitelist": [
      "<string>"
    ]
  }
}
'

import requests

# Endpoint of the v1 extract API.
url = "https://api.firecrawl.dev/v1/extract"

# JSON request body; every "<string>" value is a placeholder to replace.
payload = {
    "urls": ["<string>"],
    "enableWebSearch": False,
    "ignoreInvalidURLs": False,
    "ignoreSitemap": False,
    "includeSubdomains": True,
    "prompt": "<string>",
    "schema": {},
    "scrapeOptions": {
        "actions": [
            {
                "type": "wait",
                "milliseconds": 2,
                "selector": "#my-element"
            }
        ],
        "blockAds": True,
        "excludeTags": ["<string>"],
        "headers": {},
        "includeTags": ["<string>"],
        "jsonOptions": {
            "prompt": "<string>",
            "schema": {},
            "systemPrompt": "<string>"
        },
        "location": {
            "country": "US",
            "languages": ["en-US"]
        },
        "maxAge": 0,
        "mobile": False,
        "onlyMainContent": True,
        "parsePDF": True,
        "removeBase64Images": True,
        "skipTlsVerification": False,
        "storeInCache": True,
        "threatProtection": {
            "blacklist": ["<string>"],
            "blockedTlds": ["<string>"],
            "riskScoreThreshold": 75,
            "whitelist": ["<string>"]
        },
        "timeout": 30000,
        "waitFor": 0,
        "changeTrackingOptions": {
            "modes": [],
            "prompt": "<string>",
            "schema": {},
            "tag": None
        },
        "formats": ["markdown"]
    },
    "showSources": False,
    "threatProtection": {
        "blacklist": ["<string>"],
        "blockedTlds": ["<string>"],
        "riskScoreThreshold": 75,
        "whitelist": ["<string>"]
    }
}

# <token> is a placeholder for the API key.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout by default, so an unresponsive server would
# block this call forever; 30 seconds matches the PHP sample's CURLOPT_TIMEOUT.
response = requests.post(url, json=payload, headers=headers, timeout=30)

# Print the raw response body, whatever the HTTP status was.
print(response.text)

// Address of the v1 extract endpoint.
const endpoint = 'https://api.firecrawl.dev/v1/extract';

// Request body; every '<string>' value is a placeholder to replace.
const payload = {
  urls: ['<string>'],
  enableWebSearch: false,
  ignoreInvalidURLs: false,
  ignoreSitemap: false,
  includeSubdomains: true,
  prompt: '<string>',
  schema: {},
  scrapeOptions: {
    actions: [
      {
        type: 'wait',
        milliseconds: 2,
        selector: '#my-element'
      }
    ],
    blockAds: true,
    excludeTags: ['<string>'],
    headers: {},
    includeTags: ['<string>'],
    jsonOptions: {
      prompt: '<string>',
      schema: {},
      systemPrompt: '<string>'
    },
    location: {
      country: 'US',
      languages: ['en-US']
    },
    maxAge: 0,
    mobile: false,
    onlyMainContent: true,
    parsePDF: true,
    removeBase64Images: true,
    skipTlsVerification: false,
    storeInCache: true,
    threatProtection: {
      blacklist: ['<string>'],
      blockedTlds: ['<string>'],
      riskScoreThreshold: 75,
      whitelist: ['<string>']
    },
    timeout: 30000,
    waitFor: 0,
    changeTrackingOptions: {
      modes: [],
      prompt: '<string>',
      schema: {},
      tag: null
    },
    formats: ['markdown']
  },
  showSources: false,
  threatProtection: {
    blacklist: ['<string>'],
    blockedTlds: ['<string>'],
    riskScoreThreshold: 75,
    whitelist: ['<string>']
  }
};

// fetch() settings: method, auth/content-type headers and the serialized body.
const options = {
  method: 'POST',
  headers: {
    Authorization: 'Bearer <token>',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify(payload)
};

// Send the request, log the parsed JSON reply, and report any failure
// (network error or unparsable body) on stderr.
(async () => {
  try {
    const res = await fetch(endpoint, options);
    const data = await res.json();
    console.log(data);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Sends the sample extract request to the Firecrawl v1 API and echoes either
// the raw response body or the cURL error text.

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.firecrawl.dev/v1/extract",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  // json_encode() serialises an empty PHP array as the JSON list [], so every
  // field that must be an empty JSON object ({}) is given as a stdClass
  // instance instead. 'modes' is a genuine list and stays an array.
  CURLOPT_POSTFIELDS => json_encode([
    'urls' => [
      '<string>'
    ],
    'enableWebSearch' => false,
    'ignoreInvalidURLs' => false,
    'ignoreSitemap' => false,
    'includeSubdomains' => true,
    'prompt' => '<string>',
    'schema' => new stdClass(),
    'scrapeOptions' => [
      'actions' => [
        [
          'type' => 'wait',
          'milliseconds' => 2,
          'selector' => '#my-element'
        ]
      ],
      'blockAds' => true,
      'excludeTags' => [
        '<string>'
      ],
      'headers' => new stdClass(),
      'includeTags' => [
        '<string>'
      ],
      'jsonOptions' => [
        'prompt' => '<string>',
        'schema' => new stdClass(),
        'systemPrompt' => '<string>'
      ],
      'location' => [
        'country' => 'US',
        'languages' => [
          'en-US'
        ]
      ],
      'maxAge' => 0,
      'mobile' => false,
      'onlyMainContent' => true,
      'parsePDF' => true,
      'removeBase64Images' => true,
      'skipTlsVerification' => false,
      'storeInCache' => true,
      'threatProtection' => [
        'blacklist' => [
          '<string>'
        ],
        'blockedTlds' => [
          '<string>'
        ],
        'riskScoreThreshold' => 75,
        'whitelist' => [
          '<string>'
        ]
      ],
      'timeout' => 30000,
      'waitFor' => 0,
      'changeTrackingOptions' => [
        'modes' => [],
        'prompt' => '<string>',
        'schema' => new stdClass(),
        'tag' => null
      ],
      'formats' => [
        'markdown'
      ]
    ],
    'showSources' => false,
    'threatProtection' => [
      'blacklist' => [
        '<string>'
      ],
      'blockedTlds' => [
        '<string>'
      ],
      'riskScoreThreshold' => 75,
      'whitelist' => [
        '<string>'
      ]
    ]
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns the body as a string.
$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

// curl_error() yields an empty string when the transfer succeeded.
if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts the sample extract request to the Firecrawl v1 API and prints the
// raw response body to standard output. Any failure is printed and ends the
// program instead of being silently discarded.
func main() {

	url := "https://api.firecrawl.dev/v1/extract"

	// Request body; every "<string>" value is a placeholder to replace.
	payload := strings.NewReader("{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": false,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"type\": \"wait\",\n        \"milliseconds\": 2,\n        \"selector\": \"#my-element\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"jsonOptions\": {\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"systemPrompt\": \"<string>\"\n    },\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"maxAge\": 0,\n    \"mobile\": false,\n    \"onlyMainContent\": true,\n    \"parsePDF\": true,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": false,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 30000,\n    \"waitFor\": 0,\n    \"changeTrackingOptions\": {\n      \"modes\": [],\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"tag\": null\n    },\n    \"formats\": [\n      \"markdown\"\n    ]\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("building request failed:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so res.Body must not be touched here.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response failed:", err)
		return
	}

	fmt.Println(string(body))
}

// JSON request body, sent verbatim; every "<string>" value is a placeholder.
String requestBody = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": false,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"type\": \"wait\",\n        \"milliseconds\": 2,\n        \"selector\": \"#my-element\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"jsonOptions\": {\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"systemPrompt\": \"<string>\"\n    },\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"maxAge\": 0,\n    \"mobile\": false,\n    \"onlyMainContent\": true,\n    \"parsePDF\": true,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": false,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 30000,\n    \"waitFor\": 0,\n    \"changeTrackingOptions\": {\n      \"modes\": [],\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"tag\": null\n    },\n    \"formats\": [\n      \"markdown\"\n    ]\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}";

// POST the body to the v1 extract endpoint and read the reply as a string.
HttpResponse<String> response = Unirest.post("https://api.firecrawl.dev/v1/extract")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(requestBody)
  .asString();

require 'uri'
require 'net/http'

# Endpoint of the v1 extract API.
url = URI("https://api.firecrawl.dev/v1/extract")

# Build the POST request with its headers supplied at construction time.
request = Net::HTTP::Post.new(url, {
  "Authorization" => 'Bearer <token>',
  "Content-Type" => 'application/json'
})
# JSON request body, sent verbatim; every "<string>" value is a placeholder.
request.body = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": false,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"type\": \"wait\",\n        \"milliseconds\": 2,\n        \"selector\": \"#my-element\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"jsonOptions\": {\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"systemPrompt\": \"<string>\"\n    },\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"maxAge\": 0,\n    \"mobile\": false,\n    \"onlyMainContent\": true,\n    \"parsePDF\": true,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": false,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 30000,\n    \"waitFor\": 0,\n    \"changeTrackingOptions\": {\n      \"modes\": [],\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"tag\": null\n    },\n    \"formats\": [\n      \"markdown\"\n    ]\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}"

# Open a TLS connection for the duration of the block and send the request.
response = Net::HTTP.start(url.host, url.port, use_ssl: true) do |http|
  http.request(request)
end
puts response.read_body

{
  "id": "<string>",
  "invalidURLs": [
    "<string>"
  ],
  "success": true
}

{
  "error": "Invalid input data."
}

{
  "error": "An unexpected error occurred on the server."
}

POST

extract

Extrae datos estructurados de páginas web usando LLMs

# POST a JSON extract request to the Firecrawl v1 API.
# <token> and every "<string>" value below are placeholders.
curl -X POST \
  'https://api.firecrawl.dev/v1/extract' \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "urls": [
    "<string>"
  ],
  "enableWebSearch": false,
  "ignoreInvalidURLs": false,
  "ignoreSitemap": false,
  "includeSubdomains": true,
  "prompt": "<string>",
  "schema": {},
  "scrapeOptions": {
    "actions": [
      {
        "type": "wait",
        "milliseconds": 2,
        "selector": "#my-element"
      }
    ],
    "blockAds": true,
    "excludeTags": [
      "<string>"
    ],
    "headers": {},
    "includeTags": [
      "<string>"
    ],
    "jsonOptions": {
      "prompt": "<string>",
      "schema": {},
      "systemPrompt": "<string>"
    },
    "location": {
      "country": "US",
      "languages": [
        "en-US"
      ]
    },
    "maxAge": 0,
    "mobile": false,
    "onlyMainContent": true,
    "parsePDF": true,
    "removeBase64Images": true,
    "skipTlsVerification": false,
    "storeInCache": true,
    "threatProtection": {
      "blacklist": [
        "<string>"
      ],
      "blockedTlds": [
        "<string>"
      ],
      "riskScoreThreshold": 75,
      "whitelist": [
        "<string>"
      ]
    },
    "timeout": 30000,
    "waitFor": 0,
    "changeTrackingOptions": {
      "modes": [],
      "prompt": "<string>",
      "schema": {},
      "tag": null
    },
    "formats": [
      "markdown"
    ]
  },
  "showSources": false,
  "threatProtection": {
    "blacklist": [
      "<string>"
    ],
    "blockedTlds": [
      "<string>"
    ],
    "riskScoreThreshold": 75,
    "whitelist": [
      "<string>"
    ]
  }
}
'

import requests

# Endpoint of the v1 extract API.
url = "https://api.firecrawl.dev/v1/extract"

# JSON request body; every "<string>" value is a placeholder to replace.
payload = {
    "urls": ["<string>"],
    "enableWebSearch": False,
    "ignoreInvalidURLs": False,
    "ignoreSitemap": False,
    "includeSubdomains": True,
    "prompt": "<string>",
    "schema": {},
    "scrapeOptions": {
        "actions": [
            {
                "type": "wait",
                "milliseconds": 2,
                "selector": "#my-element"
            }
        ],
        "blockAds": True,
        "excludeTags": ["<string>"],
        "headers": {},
        "includeTags": ["<string>"],
        "jsonOptions": {
            "prompt": "<string>",
            "schema": {},
            "systemPrompt": "<string>"
        },
        "location": {
            "country": "US",
            "languages": ["en-US"]
        },
        "maxAge": 0,
        "mobile": False,
        "onlyMainContent": True,
        "parsePDF": True,
        "removeBase64Images": True,
        "skipTlsVerification": False,
        "storeInCache": True,
        "threatProtection": {
            "blacklist": ["<string>"],
            "blockedTlds": ["<string>"],
            "riskScoreThreshold": 75,
            "whitelist": ["<string>"]
        },
        "timeout": 30000,
        "waitFor": 0,
        "changeTrackingOptions": {
            "modes": [],
            "prompt": "<string>",
            "schema": {},
            "tag": None
        },
        "formats": ["markdown"]
    },
    "showSources": False,
    "threatProtection": {
        "blacklist": ["<string>"],
        "blockedTlds": ["<string>"],
        "riskScoreThreshold": 75,
        "whitelist": ["<string>"]
    }
}

# <token> is a placeholder for the API key.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout by default, so an unresponsive server would
# block this call forever; 30 seconds matches the PHP sample's CURLOPT_TIMEOUT.
response = requests.post(url, json=payload, headers=headers, timeout=30)

# Print the raw response body, whatever the HTTP status was.
print(response.text)

// Address of the v1 extract endpoint.
const endpoint = 'https://api.firecrawl.dev/v1/extract';

// Request body; every '<string>' value is a placeholder to replace.
const payload = {
  urls: ['<string>'],
  enableWebSearch: false,
  ignoreInvalidURLs: false,
  ignoreSitemap: false,
  includeSubdomains: true,
  prompt: '<string>',
  schema: {},
  scrapeOptions: {
    actions: [
      {
        type: 'wait',
        milliseconds: 2,
        selector: '#my-element'
      }
    ],
    blockAds: true,
    excludeTags: ['<string>'],
    headers: {},
    includeTags: ['<string>'],
    jsonOptions: {
      prompt: '<string>',
      schema: {},
      systemPrompt: '<string>'
    },
    location: {
      country: 'US',
      languages: ['en-US']
    },
    maxAge: 0,
    mobile: false,
    onlyMainContent: true,
    parsePDF: true,
    removeBase64Images: true,
    skipTlsVerification: false,
    storeInCache: true,
    threatProtection: {
      blacklist: ['<string>'],
      blockedTlds: ['<string>'],
      riskScoreThreshold: 75,
      whitelist: ['<string>']
    },
    timeout: 30000,
    waitFor: 0,
    changeTrackingOptions: {
      modes: [],
      prompt: '<string>',
      schema: {},
      tag: null
    },
    formats: ['markdown']
  },
  showSources: false,
  threatProtection: {
    blacklist: ['<string>'],
    blockedTlds: ['<string>'],
    riskScoreThreshold: 75,
    whitelist: ['<string>']
  }
};

// fetch() settings: method, auth/content-type headers and the serialized body.
const options = {
  method: 'POST',
  headers: {
    Authorization: 'Bearer <token>',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify(payload)
};

// Send the request, log the parsed JSON reply, and report any failure
// (network error or unparsable body) on stderr.
(async () => {
  try {
    const res = await fetch(endpoint, options);
    const data = await res.json();
    console.log(data);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Sends the sample extract request to the Firecrawl v1 API and echoes either
// the raw response body or the cURL error text.

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.firecrawl.dev/v1/extract",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  // json_encode() serialises an empty PHP array as the JSON list [], so every
  // field that must be an empty JSON object ({}) is given as a stdClass
  // instance instead. 'modes' is a genuine list and stays an array.
  CURLOPT_POSTFIELDS => json_encode([
    'urls' => [
      '<string>'
    ],
    'enableWebSearch' => false,
    'ignoreInvalidURLs' => false,
    'ignoreSitemap' => false,
    'includeSubdomains' => true,
    'prompt' => '<string>',
    'schema' => new stdClass(),
    'scrapeOptions' => [
      'actions' => [
        [
          'type' => 'wait',
          'milliseconds' => 2,
          'selector' => '#my-element'
        ]
      ],
      'blockAds' => true,
      'excludeTags' => [
        '<string>'
      ],
      'headers' => new stdClass(),
      'includeTags' => [
        '<string>'
      ],
      'jsonOptions' => [
        'prompt' => '<string>',
        'schema' => new stdClass(),
        'systemPrompt' => '<string>'
      ],
      'location' => [
        'country' => 'US',
        'languages' => [
          'en-US'
        ]
      ],
      'maxAge' => 0,
      'mobile' => false,
      'onlyMainContent' => true,
      'parsePDF' => true,
      'removeBase64Images' => true,
      'skipTlsVerification' => false,
      'storeInCache' => true,
      'threatProtection' => [
        'blacklist' => [
          '<string>'
        ],
        'blockedTlds' => [
          '<string>'
        ],
        'riskScoreThreshold' => 75,
        'whitelist' => [
          '<string>'
        ]
      ],
      'timeout' => 30000,
      'waitFor' => 0,
      'changeTrackingOptions' => [
        'modes' => [],
        'prompt' => '<string>',
        'schema' => new stdClass(),
        'tag' => null
      ],
      'formats' => [
        'markdown'
      ]
    ],
    'showSources' => false,
    'threatProtection' => [
      'blacklist' => [
        '<string>'
      ],
      'blockedTlds' => [
        '<string>'
      ],
      'riskScoreThreshold' => 75,
      'whitelist' => [
        '<string>'
      ]
    ]
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns the body as a string.
$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

// curl_error() yields an empty string when the transfer succeeded.
if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts the sample extract request to the Firecrawl v1 API and prints the
// raw response body to standard output. Any failure is printed and ends the
// program instead of being silently discarded.
func main() {

	url := "https://api.firecrawl.dev/v1/extract"

	// Request body; every "<string>" value is a placeholder to replace.
	payload := strings.NewReader("{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": false,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"type\": \"wait\",\n        \"milliseconds\": 2,\n        \"selector\": \"#my-element\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"jsonOptions\": {\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"systemPrompt\": \"<string>\"\n    },\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"maxAge\": 0,\n    \"mobile\": false,\n    \"onlyMainContent\": true,\n    \"parsePDF\": true,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": false,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 30000,\n    \"waitFor\": 0,\n    \"changeTrackingOptions\": {\n      \"modes\": [],\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"tag\": null\n    },\n    \"formats\": [\n      \"markdown\"\n    ]\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("building request failed:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so res.Body must not be touched here.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response failed:", err)
		return
	}

	fmt.Println(string(body))
}

// JSON request body, sent verbatim; every "<string>" value is a placeholder.
String requestBody = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": false,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"type\": \"wait\",\n        \"milliseconds\": 2,\n        \"selector\": \"#my-element\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"jsonOptions\": {\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"systemPrompt\": \"<string>\"\n    },\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"maxAge\": 0,\n    \"mobile\": false,\n    \"onlyMainContent\": true,\n    \"parsePDF\": true,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": false,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 30000,\n    \"waitFor\": 0,\n    \"changeTrackingOptions\": {\n      \"modes\": [],\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"tag\": null\n    },\n    \"formats\": [\n      \"markdown\"\n    ]\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}";

// POST the body to the v1 extract endpoint and read the reply as a string.
HttpResponse<String> response = Unirest.post("https://api.firecrawl.dev/v1/extract")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(requestBody)
  .asString();

require 'uri'
require 'net/http'

# Endpoint of the v1 extract API.
url = URI("https://api.firecrawl.dev/v1/extract")

# Build the POST request with its headers supplied at construction time.
request = Net::HTTP::Post.new(url, {
  "Authorization" => 'Bearer <token>',
  "Content-Type" => 'application/json'
})
# JSON request body, sent verbatim; every "<string>" value is a placeholder.
request.body = "{\n  \"urls\": [\n    \"<string>\"\n  ],\n  \"enableWebSearch\": false,\n  \"ignoreInvalidURLs\": false,\n  \"ignoreSitemap\": false,\n  \"includeSubdomains\": true,\n  \"prompt\": \"<string>\",\n  \"schema\": {},\n  \"scrapeOptions\": {\n    \"actions\": [\n      {\n        \"type\": \"wait\",\n        \"milliseconds\": 2,\n        \"selector\": \"#my-element\"\n      }\n    ],\n    \"blockAds\": true,\n    \"excludeTags\": [\n      \"<string>\"\n    ],\n    \"headers\": {},\n    \"includeTags\": [\n      \"<string>\"\n    ],\n    \"jsonOptions\": {\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"systemPrompt\": \"<string>\"\n    },\n    \"location\": {\n      \"country\": \"US\",\n      \"languages\": [\n        \"en-US\"\n      ]\n    },\n    \"maxAge\": 0,\n    \"mobile\": false,\n    \"onlyMainContent\": true,\n    \"parsePDF\": true,\n    \"removeBase64Images\": true,\n    \"skipTlsVerification\": false,\n    \"storeInCache\": true,\n    \"threatProtection\": {\n      \"blacklist\": [\n        \"<string>\"\n      ],\n      \"blockedTlds\": [\n        \"<string>\"\n      ],\n      \"riskScoreThreshold\": 75,\n      \"whitelist\": [\n        \"<string>\"\n      ]\n    },\n    \"timeout\": 30000,\n    \"waitFor\": 0,\n    \"changeTrackingOptions\": {\n      \"modes\": [],\n      \"prompt\": \"<string>\",\n      \"schema\": {},\n      \"tag\": null\n    },\n    \"formats\": [\n      \"markdown\"\n    ]\n  },\n  \"showSources\": false,\n  \"threatProtection\": {\n    \"blacklist\": [\n      \"<string>\"\n    ],\n    \"blockedTlds\": [\n      \"<string>\"\n    ],\n    \"riskScoreThreshold\": 75,\n    \"whitelist\": [\n      \"<string>\"\n    ]\n  }\n}"

# Open a TLS connection for the duration of the block and send the request.
response = Net::HTTP.start(url.host, url.port, use_ssl: true) do |http|
  http.request(request)
end
puts response.read_body

{
  "id": "<string>",
  "invalidURLs": [
    "<string>"
  ],
  "success": true
}

{
  "error": "Invalid input data."
}

{
  "error": "An unexpected error occurred on the server."
}

Nota: Ya está disponible una nueva versión v2 de esta API con funcionalidades y rendimiento mejorados.

Autorizaciones

Authorization

string

header

requerido

Encabezado de autenticación Bearer con el formato Bearer <token>, donde <token> es tu token de autenticación.

Cuerpo

application/json

urls

string<uri>[]

requerido

Las URL de las que se va a extraer datos. Las URL deben estar en formato glob.

enableWebSearch

boolean

predeterminado:false

Cuando se establece en true, la extracción utilizará la búsqueda web para encontrar datos adicionales.

ignoreInvalidURLs

boolean

predeterminado:false

Si se especifican URLs no válidas en el array urls, se ignorarán. En lugar de hacer que falle toda la solicitud, se realizará una extracción con las URLs válidas restantes y las URLs no válidas se devolverán en el campo invalidURLs de la respuesta.

ignoreSitemap

boolean

predeterminado:false

Si se establece en true, se ignorarán los archivos sitemap.xml durante el rastreo del sitio web.

includeSubdomains

boolean

predeterminado:true

Si es true, también se escanearán los subdominios de las URL proporcionadas.

prompt

string

Prompt que guía el proceso de extracción

schema

object

Esquema para definir la estructura de los datos extraídos. Debe ajustarse a JSON Schema.

scrapeOptions

object

Mostrar atributos secundarios

showSources

boolean

predeterminado:false

Si es true, las fuentes utilizadas para extraer los datos se incluirán en la respuesta en la clave sources.

threatProtection

Threat Protection Override · object

Anulación por solicitud de Protección contra amenazas. Los campos que proporciones reemplazan los campos correspondientes de la política de tu organización solo para esta solicitud; los campos omitidos conservan sus valores a nivel de organización. Requiere que Protección contra amenazas esté habilitada para tu equipo (función Enterprise); de lo contrario, la solicitud se rechaza con un 403. Si tu organización ha deshabilitado las anulaciones por solicitud, cualquier solicitud que incluya este objeto se rechaza con un 403. Si Protección contra amenazas se aplica de forma obligatoria para tu equipo, mode no puede establecerse en off.

Mostrar atributos secundarios

Respuesta

Extracción exitosa

id

string

invalidURLs

string[] | null

Si ignoreInvalidURLs es true, será un array que contiene las URL no válidas especificadas en la solicitud. Si no hay URL no válidas, será un array vacío. Si ignoreInvalidURLs es false, este campo será undefined.

success

boolean

Obtener estado de extracción

Uso de la API

Endpoints de Scrape

Endpoints de Crawl

Endpoints de Map

Endpoints de Search

Endpoints de Extract

Endpoints de la cuenta

Autorizaciones

Cuerpo

Respuesta