440 lines
10 KiB
JSON
440 lines
10 KiB
JSON
{
|
||
"id": "xEij0kj2I1DHbL3I",
|
||
"meta": {
|
||
"instanceId": "31e69f7f4a77bf465b805824e303232f0227212ae922d12133a0f96ffeab4fef",
|
||
"templateCredsSetupCompleted": true
|
||
},
|
||
"name": "💡🌐 Essential Multipage Website Scraper with Jina.ai",
|
||
"tags": [],
|
||
"nodes": [
|
||
{
|
||
"id": "3a503859-ef0a-492d-81c6-37e4f0c4c25e",
|
||
"name": "Sticky Note",
|
||
"type": "n8n-nodes-base.stickyNote",
|
||
"position": [
|
||
-840,
|
||
0
|
||
],
|
||
"parameters": {
|
||
"color": 3,
|
||
"width": 340,
|
||
"height": 320,
|
||
"content": "## Jina.ai Web Scraper\n### No API Key Required\n"
|
||
},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "c5217a1a-f074-409b-8340-72afdc5fc8b5",
|
||
"name": "When clicking ‘Test workflow’",
|
||
"type": "n8n-nodes-base.manualTrigger",
|
||
"position": [
|
||
-1500,
|
||
-300
|
||
],
|
||
"parameters": {},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "72af3b00-2632-4877-a0b6-7477e2f468f7",
|
||
"name": "Loop Over Items",
|
||
"type": "n8n-nodes-base.splitInBatches",
|
||
"position": [
|
||
-1080,
|
||
20
|
||
],
|
||
"parameters": {
|
||
"options": {}
|
||
},
|
||
"typeVersion": 3
|
||
},
|
||
{
|
||
"id": "11f0fa02-51f8-41cc-b789-5c452b6899aa",
|
||
"name": "Wait",
|
||
"type": "n8n-nodes-base.wait",
|
||
"position": [
|
||
80,
|
||
220
|
||
],
|
||
"webhookId": "081ce124-0cbf-4a21-a1e7-2c465f460448",
|
||
"parameters": {},
|
||
"typeVersion": 1.1
|
||
},
|
||
{
|
||
"id": "cf3b5887-8ff2-46e0-ab33-384ab0987cbb",
|
||
"name": "Limit",
|
||
"type": "n8n-nodes-base.limit",
|
||
"position": [
|
||
80,
|
||
-300
|
||
],
|
||
"parameters": {
|
||
"maxItems": 20
|
||
},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "c4f04d82-aa33-46cf-a8e2-0b4e717e754a",
|
||
"name": "Get List of Website URLs",
|
||
"type": "n8n-nodes-base.httpRequest",
|
||
"position": [
|
||
-780,
|
||
-300
|
||
],
|
||
"parameters": {
|
||
"url": "={{ $json.sitemap_url }}",
|
||
"options": {}
|
||
},
|
||
"typeVersion": 4.2
|
||
},
|
||
{
|
||
"id": "7f507c38-1e9e-4c46-8dea-bd6daf65dc55",
|
||
"name": "Convert to JSON",
|
||
"type": "n8n-nodes-base.xml",
|
||
"position": [
|
||
-560,
|
||
-300
|
||
],
|
||
"parameters": {
|
||
"options": {}
|
||
},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "e21b55c2-8b0d-4c7c-ba91-a2d563a4c966",
|
||
"name": "Create List of Website URLs",
|
||
"type": "n8n-nodes-base.splitOut",
|
||
"position": [
|
||
-340,
|
||
-300
|
||
],
|
||
"parameters": {
|
||
"options": {},
|
||
"fieldToSplitOut": "urlset.url"
|
||
},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "61555239-8a16-424e-8a60-700f6ebaa270",
|
||
"name": "Filter By Topics or Pages",
|
||
"type": "n8n-nodes-base.filter",
|
||
"position": [
|
||
-120,
|
||
-300
|
||
],
|
||
"parameters": {
|
||
"options": {},
|
||
"conditions": {
|
||
"options": {
|
||
"version": 2,
|
||
"leftValue": "",
|
||
"caseSensitive": true,
|
||
"typeValidation": "strict"
|
||
},
|
||
"combinator": "or",
|
||
"conditions": [
|
||
{
|
||
"id": "d66c304d-879a-4dc4-908f-ab0665093672",
|
||
"operator": {
|
||
"name": "filter.operator.equals",
|
||
"type": "string",
|
||
"operation": "equals"
|
||
},
|
||
"leftValue": "={{ $json.loc }}",
|
||
"rightValue": "=https://ai.pydantic.dev/"
|
||
},
|
||
{
|
||
"id": "3c930950-bee4-442b-82e6-4437fd39a933",
|
||
"operator": {
|
||
"type": "string",
|
||
"operation": "contains"
|
||
},
|
||
"leftValue": "={{ $json.loc.toLowerCase() }}",
|
||
"rightValue": "agent"
|
||
},
|
||
{
|
||
"id": "aaeaf34e-ad5a-4673-b3bd-8bddf3500988",
|
||
"operator": {
|
||
"type": "string",
|
||
"operation": "contains"
|
||
},
|
||
"leftValue": "={{ $json.loc.toLowerCase() }}",
|
||
"rightValue": "tool"
|
||
}
|
||
]
|
||
}
|
||
},
|
||
"typeVersion": 2.2
|
||
},
|
||
{
|
||
"id": "dd25fb57-64a3-4c47-be04-6eb66d16520a",
|
||
"name": "Set Website URL",
|
||
"type": "n8n-nodes-base.set",
|
||
"position": [
|
||
-1080,
|
||
-300
|
||
],
|
||
"parameters": {
|
||
"options": {},
|
||
"assignments": {
|
||
"assignments": [
|
||
{
|
||
"id": "1601dc3e-8024-4e19-b592-93a4e4f77641",
|
||
"name": "sitemap_url",
|
||
"type": "string",
|
||
"value": "https://ai.pydantic.dev/sitemap.xml"
|
||
}
|
||
]
|
||
}
|
||
},
|
||
"typeVersion": 3.4
|
||
},
|
||
{
|
||
"id": "14ac1c87-29fe-44c8-9c1e-f247a292dde5",
|
||
"name": "Jina.ai Web Scraper",
|
||
"type": "n8n-nodes-base.httpRequest",
|
||
"position": [
|
||
-720,
|
||
120
|
||
],
|
||
"parameters": {
|
||
"url": "=https://r.jina.ai/{{ $json.loc }}",
|
||
"options": {}
|
||
},
|
||
"typeVersion": 4.2
|
||
},
|
||
{
|
||
"id": "be253ec2-f088-4895-8ef2-61a3720cf68b",
|
||
"name": "Save Webpage Contents to Google Drive",
|
||
"type": "n8n-nodes-base.googleDrive",
|
||
"position": [
|
||
-120,
|
||
120
|
||
],
|
||
"parameters": {
|
||
"name": "={{ $('Loop Over Items').item.json.loc }} - {{ $json.title }}",
|
||
"content": "={{ $json.markdown }}",
|
||
"driveId": {
|
||
"__rl": true,
|
||
"mode": "list",
|
||
"value": "My Drive"
|
||
},
|
||
"options": {},
|
||
"folderId": {
|
||
"__rl": true,
|
||
"mode": "list",
|
||
"value": "root",
|
||
"cachedResultName": "/ (Root folder)"
|
||
},
|
||
"operation": "createFromText"
|
||
},
|
||
"credentials": {
|
||
"googleDriveOAuth2Api": {
|
||
"id": "UhdXGYLTAJbsa0xX",
|
||
"name": "Google Drive account"
|
||
}
|
||
},
|
||
"typeVersion": 3
|
||
},
|
||
{
|
||
"id": "95d808c7-a3ca-4f59-a385-cc77bdff322e",
|
||
"name": "Extract Title & Markdown Content",
|
||
"type": "n8n-nodes-base.code",
|
||
"position": [
|
||
-380,
|
||
120
|
||
],
|
||
"parameters": {
|
||
"jsCode": "// Get the text output from the previous node\nconst data = $input.first().json.data;\n\n// Regular expression to capture the title line\nconst titleRegex = /^Title:\\s*(.+)$/m;\n// Regular expression to capture everything after \"Markdown Content:\"\nconst markdownRegex = /Markdown Content:\\n([\\s\\S]+)/;\n\n// Extract the title using the first capture group\nconst titleMatch = data.match(titleRegex);\nconst title = titleMatch ? titleMatch[1].trim() : '';\n\n// Extract the markdown content using the first capture group\nconst markdownMatch = data.match(markdownRegex);\nconst markdown = markdownMatch ? markdownMatch[1].trim() : '';\n\n// Return a single object with title and markdown as unique values\nreturn { title, markdown };"
|
||
},
|
||
"typeVersion": 2
|
||
},
|
||
{
|
||
"id": "2fb86c81-c144-4450-908c-559855deadef",
|
||
"name": "Sticky Note1",
|
||
"type": "n8n-nodes-base.stickyNote",
|
||
"position": [
|
||
-1240,
|
||
-580
|
||
],
|
||
"parameters": {
|
||
"color": 7,
|
||
"width": 1540,
|
||
"height": 1080,
|
||
"content": "# 💡🌐 Essential Multipage Website Scraper with Jina.ai\n## Scrape entire websites with this workflow\n**Use responsibly and follow local rules and regulations**"
|
||
},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "b470b294-95d0-4e51-a9cc-2fe17316a771",
|
||
"name": "Sticky Note2",
|
||
"type": "n8n-nodes-base.stickyNote",
|
||
"position": [
|
||
-1580,
|
||
-400
|
||
],
|
||
"parameters": {
|
||
"color": 4,
|
||
"width": 280,
|
||
"height": 300,
|
||
"content": "## 👍Try Me!"
|
||
},
|
||
"typeVersion": 1
|
||
},
|
||
{
|
||
"id": "fafd0623-a423-4e73-9609-cee8e81f5c13",
|
||
"name": "Sticky Note3",
|
||
"type": "n8n-nodes-base.stickyNote",
|
||
"position": [
|
||
-1180,
|
||
-400
|
||
],
|
||
"parameters": {
|
||
"width": 300,
|
||
"height": 300,
|
||
"content": "## 👇Add Website Sitemap URL"
|
||
},
|
||
"typeVersion": 1
|
||
}
|
||
],
|
||
"active": false,
|
||
"pinData": {},
|
||
"settings": {
|
||
"executionOrder": "v1"
|
||
},
|
||
"versionId": "2e815787-d83b-4ab7-a959-2f33006a37a5",
|
||
"connections": {
|
||
"Wait": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Loop Over Items",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Limit": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Loop Over Items",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Convert to JSON": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Create List of Website URLs",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Loop Over Items": {
|
||
"main": [
|
||
[],
|
||
[
|
||
{
|
||
"node": "Jina.ai Web Scraper",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Set Website URL": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Get List of Website URLs",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Jina.ai Web Scraper": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Extract Title & Markdown Content",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Get List of Website URLs": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Convert to JSON",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Filter By Topics or Pages": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Limit",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Create List of Website URLs": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Filter By Topics or Pages",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Extract Title & Markdown Content": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Save Webpage Contents to Google Drive",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"When clicking ‘Test workflow’": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Set Website URL",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
},
|
||
"Save Webpage Contents to Google Drive": {
|
||
"main": [
|
||
[
|
||
{
|
||
"node": "Wait",
|
||
"type": "main",
|
||
"index": 0
|
||
}
|
||
]
|
||
]
|
||
}
|
||
}
|
||
} |