Tutorial: Export Crawl Data
This tutorial covers how to export crawl data from Lumar, including report downloads for bulk export and pagination for programmatic access.
Option 1: Report downloads (recommended for bulk export)
Report downloads generate a downloadable file (CSV or other formats) containing the full dataset for a report. This is the most efficient way to export large amounts of data.
Step 1: Create a report download
Use the createReportDownload mutation to request a file. You can specify which metrics (columns) to include and apply filters.
- Mutation
- Variables
- Response
- cURL
mutation CreateReportDownload($input: CreateReportDownloadInput!) {
createReportDownload(input: $input) {
reportDownload {
...ReportDownloadDetails
}
}
}
fragment ReportDownloadDetails on ReportDownload {
id
status
outputType
# ...other fields you want to retrieve
}
{
"input": {
"crawlId": "TjAwNUNyYXdsMTc2NjI0MQ",
"reportTemplateCode": "all_pages",
"reportTypeCode": "Basic",
"selectedMetrics": ["url", "httpStatusCode"],
"fileName": "url-error-status-codes",
"filter": { "httpStatusCode": { "ge": 500 } }
}
}
{
"data": {
"createReportDownload": {
"reportDownload": {
"id": "TjAxNFJlcG9ydERvd25sb2FkNzU4Njg3",
"status": "Generating",
"outputType": "CsvZip"
}
}
}
}
curl -X POST -H "Content-Type: application/json" -H "apollographql-client-name: docs-example-client" -H "apollographql-client-version: 1.0.0" -H "x-auth-token: YOUR_API_SESSION_TOKEN" --data '{"query":"mutation CreateReportDownload($input: CreateReportDownloadInput!) { createReportDownload(input: $input) { reportDownload { ...ReportDownloadDetails } } } fragment ReportDownloadDetails on ReportDownload { id status outputType }","variables":{"input":{"crawlId":"TjAwNUNyYXdsMTc2NjI0MQ","reportTemplateCode":"all_pages","reportTypeCode":"Basic","selectedMetrics":["url","httpStatusCode"],"fileName":"url-error-status-codes","filter":{"httpStatusCode":{"ge":500}}}}}' https://api.lumar.io/graphql
Step 2: Poll for download completion
The report download starts with a Generating status. Poll until it reaches Completed, then use the fileURL to download the file.
- Query
- Variables
- Response
- cURL
query GetReportDownloadStatus($reportDownloadId: ObjectID!) {
node(id: $reportDownloadId) {
... on ReportDownload {
id
status
outputType
fileURL
createdAt
}
}
}
{
"reportDownloadId": "TjAxNFJlcG9ydERvd25sb2FkNzU4Njg3"
}
{
"data": {
"node": {
"id": "TjAxNFJlcG9ydERvd25sb2FkNzU4Njg3",
"status": "Completed",
"outputType": "CsvZip",
"fileURL": "https://storage.example.com/reports/download.csv.zip",
"createdAt": "2025-01-15T10:00:00.000Z"
}
}
}
curl -X POST -H "Content-Type: application/json" -H "apollographql-client-name: docs-example-client" -H "apollographql-client-version: 1.0.0" -H "x-auth-token: YOUR_API_SESSION_TOKEN" --data '{"query":"query GetReportDownloadStatus($reportDownloadId: ObjectID!) { node(id: $reportDownloadId) { ... on ReportDownload { id status outputType fileURL createdAt } } }","variables":{"reportDownloadId":"TjAxNFJlcG9ydERvd25sb2FkNzU4Njg3"}}' https://api.lumar.io/graphql
/**
 * Polls a report download's status until generation finishes, then returns
 * the signed file URL.
 *
 * @param reportDownloadId - ID returned by the createReportDownload mutation.
 * @param pollIntervalMs - Delay between status checks (defaults to 10 000 ms,
 *   matching the original 10s polling cadence).
 * @returns The signed `fileURL` once the status reaches "Completed".
 * @throws Error when the download reaches the "Failed" status.
 */
async function waitForDownload(
  reportDownloadId: string,
  pollIntervalMs = 10_000,
): Promise<string> {
  while (true) {
    const result = await executeQuery(STATUS_QUERY, { reportDownloadId });
    const download = result.data.node;
    if (download.status === "Completed") {
      // The status query selects `fileURL` (capital URL); reading `fileUrl`
      // here would always yield undefined.
      return download.fileURL;
    }
    if (download.status === "Failed") {
      throw new Error("Report download failed");
    }
    console.log(`Status: ${download.status}. Checking again in ${pollIntervalMs / 1000}s...`);
    await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
  }
}
Step 3: Download the file
The fileURL is a signed URL that you can download using any HTTP client:
curl -o report.csv.zip "SIGNED_FILE_URL_HERE"
Option 2: Paginated API queries
For smaller datasets or when you need real-time access, paginate through the API directly.
- Query
- Variables
- Response
- cURL
query ExportCrawlUrls($crawlId: ObjectID!, $cursor: String) {
getReportStat(
input: { crawlId: $crawlId, reportTemplateCode: "all_pages" }
) {
crawlUrls(first: 500, after: $cursor) {
pageInfo {
hasNextPage
endCursor
}
nodes {
url
httpStatusCode
pageTitle
wordCount
fetchTime
}
totalCount
}
}
}
{
"crawlId": "TjAwNUNyYXdsMTU4MzI0NQ",
"cursor": null
}
{
"data": {
"getReportStat": {
"crawlUrls": {
"pageInfo": {
"hasNextPage": true,
"endCursor": "NTAw"
},
"nodes": [
{
"url": "https://www.example.com/",
"httpStatusCode": 200,
"pageTitle": "Home - Example",
"wordCount": 1250,
"fetchTime": 0.42
}
],
"totalCount": 2186
}
}
}
}
curl -X POST -H "Content-Type: application/json" -H "apollographql-client-name: docs-example-client" -H "apollographql-client-version: 1.0.0" -H "x-auth-token: YOUR_API_SESSION_TOKEN" --data '{"query":"query ExportCrawlUrls($crawlId: ObjectID!, $cursor: String) { getReportStat( input: { crawlId: $crawlId, reportTemplateCode: \"all_pages\" } ) { crawlUrls(first: 500, after: $cursor) { pageInfo { hasNextPage endCursor } nodes { url httpStatusCode pageTitle wordCount fetchTime } totalCount } } }","variables":{"crawlId":"TjAwNUNyYXdsMTU4MzI0NQ","cursor":null}}' https://api.lumar.io/graphql
Pagination loop
/**
 * Pages through every URL in a crawl's "all_pages" report, accumulating all
 * records by following the cursor until no further pages remain.
 *
 * @param crawlId - ID of the crawl whose URLs should be exported.
 * @returns Every URL record collected across all pages of results.
 */
async function exportAllUrls(crawlId: string): Promise<any[]> {
  const collected: any[] = [];
  let cursor: string | null = null;

  for (let morePages = true; morePages; ) {
    const result = await executeQuery(EXPORT_QUERY, { crawlId, cursor });
    const { nodes, pageInfo, totalCount } = result.data.getReportStat.crawlUrls;

    collected.push(...nodes);
    morePages = pageInfo.hasNextPage;
    cursor = pageInfo.endCursor;

    console.log(`Fetched ${collected.length} / ${totalCount} URLs`);
  }

  return collected;
}
Tips for large datasets
- Use report downloads for datasets over 10,000 URLs. Paginating through tens of thousands of records via the API is slow and consumes your rate limit budget.
- Select only the metrics you need in selectedMetrics to reduce file size.
- Apply filters to limit the export to relevant URLs (e.g., only broken pages or a specific segment).
- Use first: 500 as a reasonable page size when paginating via the API. Larger page sizes increase response time.
Next steps
- Generate Report Downloads -- detailed reference for the report download workflow.
- Filtering -- apply filters to narrow down exported data.
- Pagination -- full cursor-based pagination reference.