Hello everyone,
I have two questions.
I now have a working workflow using ‘Scraper’, but it does not search all businesses/places in the area of the city.
Also, it gives me many incomplete or missing emails. Is there a way to improve this?
I bought this workflow package, and it came with two other workflows – ‘Runner’ and ‘Iterator’.
I don’t really understand why they are included, because I only use ‘Scraper’.
Attached is the ‘Scraper’ workflow that gives incomplete email results.
Thanks for the help.
Scraper Code >
{
“name”: “Scarper”,
“nodes”: [
{
“parameters”: {},
“id”: “36927380-b031-470d-9f06-5d91dce70d6f”,
“name”: “Remove Duplicate URLs”,
“type”: “n8n-nodes-base.removeDuplicates”,
“typeVersion”: 1.1,
“position”: [
-940,
200
]
},
{
“parameters”: {
“url”: “=Restaurants Köln auf koeln.de{{ $json.query }}”,
“options”: {
“allowUnauthorizedCerts”: false
}
},
“id”: “d244deb5-3bb1-461c-93c2-640a936ba1f9”,
“name”: “Search Google Maps with query”,
“type”: “n8n-nodes-base.httpRequest”,
“typeVersion”: 4.2,
“position”: [
-1520,
200
],
“alwaysOutputData”: false,
“executeOnce”: false
},
{
“parameters”: {
"jsCode": "const data = $input.first().json.data\n\nconst regex = /https?:\\/\\/[^\\/]+/g\n\nconst urls = data.match(regex)\n\nreturn urls.map(url => ({json: {url: url}}))"
},
“id”: “14256728-482f-41f9-81ed-d2002da177b1”,
“name”: “Scrape URLs from results”,
“type”: “n8n-nodes-base.code”,
“typeVersion”: 2,
“position”: [
-1320,
200
]
},
{
“parameters”: {
“conditions”: {
“options”: {
“caseSensitive”: true,
“leftValue”: “”,
“typeValidation”: “strict”,
“version”: 2
},
“conditions”: [
{
“id”: “041797f2-2fe2-41dc-902a-d34050b9b304”,
“leftValue”: “={{ $json.url }}”,
"rightValue": "=(google|schema\\.org|example\\.com|sentry-next\\.wixpress\\.com|imli\\.com|sentry\\.wixpress\\.com|ingest\\.sentry\\.io)",
“operator”: {
“type”: “string”,
“operation”: “notRegex”
}
},
{
“id”: “eb499a7e-17bc-453c-be08-a47286f726dd”,
“leftValue”: “”,
“rightValue”: “”,
“operator”: {
“type”: “string”,
“operation”: “equals”,
“name”: “filter.operator.equals”
}
}
],
“combinator”: “and”
},
“options”: {}
},
“id”: “ca31a986-8805-4dc8-af70-b645f0a94e98”,
“name”: “Filter irrelevant URLs”,
“type”: “n8n-nodes-base.filter”,
“typeVersion”: 2.2,
“position”: [
-1120,
200
]
},
{
“parameters”: {
“content”: “### Scraper \nThis workflow will be executed in the background for each query. Click the All executions tab in the left sidebar to see the executions running.”,
“height”: 100,
“width”: 480
},
“type”: “n8n-nodes-base.stickyNote”,
“typeVersion”: 1,
“position”: [
-1760,
40
],
“id”: “b7dbcd65-6707-4a86-b5e7-fc428fdf895f”,
“name”: “Sticky Note3”
},
{
“parameters”: {
“content”: " \n1. Setup your credentials. Video tutorial\n\n2. Choose which document and sheet to save the scraped emails to. ",
“height”: 180,
“width”: 220,
“color”: 4
},
“type”: “n8n-nodes-base.stickyNote”,
“typeVersion”: 1,
“position”: [
720,
200
],
“id”: “167ed9a1-33b1-4380-93d2-2694fddb83b3”,
“name”: “Sticky Note4”
},
{
“parameters”: {
“inputSource”: “passthrough”
},
“type”: “n8n-nodes-base.executeWorkflowTrigger”,
“typeVersion”: 1.1,
“position”: [
-1760,
200
],
“id”: “fb07a678-7631-4961-bd72-f518611bd2a2”,
“name”: “Workflow Input Trigger”
},
{
“parameters”: {
“mode”: “runOnceForEachItem”,
"jsCode": "function extractPhoneNumbers(htmlContent) {\n // Regular expression for common phone number formats with clear formatting\n const phoneRegex = /(?:\\+?1[-.]?)?\\(?[0-9]{3}\\)?[-.]?[0-9]{3}[-.]?[0-9]{4}/g;\n \n // Find all matches that have at least one formatting character\n const matches = (htmlContent.match(phoneRegex) || []).filter(phone => \n /[^0-9]/.test(phone) // Only keep numbers that have non-digit characters\n );\n \n // Clean and format the phone numbers\n const phoneNumbers = matches.map(phone => {\n // Remove all non-numeric characters\n const cleaned = phone.replace(/\\D/g, '');\n \n // Format the number based on length\n if (cleaned.length === 10) {\n return `(${cleaned.slice(0,3)}) ${cleaned.slice(3,6)}-${cleaned.slice(6)}`;\n } else if (cleaned.length === 11 && cleaned.startsWith('1')) {\n return `+1 (${cleaned.slice(1,4)}) ${cleaned.slice(4,7)}-${cleaned.slice(7)}`;\n }\n \n return phone; // Return original if no specific format matches\n });\n \n // Remove duplicates\n return [...new Set(phoneNumbers)];\n}\n\n\nconst pageHTML = $json.data;\n\n// Extract emails using regex\nconst emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.(?!png|jpg|gif|jpeg)[a-zA-Z]{2,}/g;\nconst emails = pageHTML.match(emailRegex);\n// Filter away duplicate emails using a Set\nconst uniqueEmails = [...new Set(emails)];\n\n// Extract the title using an optimal regex pattern\nconst titleRegex = /<title>([\\s\\S]*?)<\\/title>/i;\nconst titleMatch = pageHTML.match(titleRegex);\nconst title = titleMatch ? titleMatch[1].trim() : null;\n\nconst phoneNumbers = extractPhoneNumbers(pageHTML);\n\n\nreturn {\n json: {\n url: $('Loop over URLs').item.json.url,\n title: title,\n pageHTML: pageHTML,\n emails: uniqueEmails,\n phoneNumbers: phoneNumbers\n }\n};\n\n"
},
“type”: “n8n-nodes-base.code”,
“typeVersion”: 2,
“position”: [
0,
180
],
“id”: “c1d05c57-7277-41ab-a586-f90ed1ca3d1e”,
“name”: “Scrape HomePage”,
“onError”: “continueRegularOutput”
},
{
“parameters”: {
“compare”: “selectedFields”,
“fieldsToCompare”: “url”,
“options”: {}
},
“id”: “7c080674-c7c9-4413-87f6-cf7eeeb24e11”,
“name”: “Remove duplicate entries”,
“type”: “n8n-nodes-base.removeDuplicates”,
“typeVersion”: 1.1,
“position”: [
460,
0
]
},
{
“parameters”: {
“options”: {
“reset”: false
}
},
“id”: “332a5f3c-b599-48a8-9e73-f096022203fe”,
“name”: “Loop over URLs”,
“type”: “n8n-nodes-base.splitInBatches”,
“typeVersion”: 3,
“position”: [
-440,
20
],
“onError”: “continueErrorOutput”
},
{
“parameters”: {
“url”: “={{ $json.url }}”,
“options”: {}
},
“id”: “4cb8cfc5-7858-4c7f-80a1-2559150f281b”,
“name”: “Request web page for URL”,
“type”: “n8n-nodes-base.httpRequest”,
“typeVersion”: 4.2,
“position”: [
-220,
100
],
“alwaysOutputData”: false,
“onError”: “continueRegularOutput”
},
{
“parameters”: {
“mode”: “runOnceForEachItem”,
"jsCode": "// Regex for emails/domains to exclude\nconst excludeRegex = /(google|gstatic|ggpht|schema\\.org|example\\.com|sentry\\.wixpress\\.com|sentry-next\\.wixpress\\.com|ingest\\.sentry\\.io|sentry\\.io|imli\\.com|test@|example@|\\.webp|\\.svg)/i;\n\n// Access emails from input\nconst emails = $input.item.json.emails || [];\n\n// Filter out unwanted emails\nconst cleanedEmails = emails.filter(email => !excludeRegex.test(email));\n\n// Return updated item\nreturn {\n json: {\n url: $input.item.json.url,\n title: $input.item.json.title,\n pageHTML: $input.item.json.pageHTML,\n phoneNumbers: $input.item.json.phoneNumbers,\n emails: cleanedEmails\n }\n};\n"
},
“type”: “n8n-nodes-base.code”,
“typeVersion”: 2,
“position”: [
200,
0
],
“id”: “deedcc78-0c58-4f5f-bf6f-5e7239fbf59d”,
“name”: “Clean Emails”
},
{
“parameters”: {
“operation”: “append”,
“documentId”: {
“__rl”: true,
“value”: “1aBpWo_sMBfUQtcQXEsZ24UWhKmiqfTvm2hYcSsPv35g”,
“mode”: “list”,
“cachedResultName”: “Vertrieb Leads”,
“cachedResultUrl”: “https://docs.google.com/spreadsheets/d/1aBpWo_sMBfUQtcQXEsZ24UWhKmiqfTvm2hYcSsPv35g/edit?usp=drivesdk”
},
“sheetName”: {
“__rl”: true,
“value”: “gid=0”,
“mode”: “list”,
“cachedResultName”: “Vertrieb Leadss”,
“cachedResultUrl”: “https://docs.google.com/spreadsheets/d/1aBpWo_sMBfUQtcQXEsZ24UWhKmiqfTvm2hYcSsPv35g/edit#gid=0”
},
“columns”: {
“mappingMode”: “defineBelow”,
“value”: {
“Spalte 1”: “={{ $json.emails }}”,
"Spalte 2": "={{ $('Scrape HomePage').item.json.url }}",
"Spalte 3": "={{ $('Scrape HomePage').item.json.title }}",
"Spalte 4": "={{ $('Scrape HomePage').item.json.phoneNumbers }}"
},
"matchingColumns": [],
“schema”: [
{
“id”: “Spalte 1”,
“displayName”: “Spalte 1”,
“required”: false,
“defaultMatch”: false,
“display”: true,
“type”: “string”,
“canBeUsedToMatch”: true,
“removed”: false
},
{
“id”: “Spalte 2”,
“displayName”: “Spalte 2”,
“required”: false,
“defaultMatch”: false,
“display”: true,
“type”: “string”,
“canBeUsedToMatch”: true,
“removed”: false
},
{
“id”: “Spalte 3”,
“displayName”: “Spalte 3”,
“required”: false,
“defaultMatch”: false,
“display”: true,
“type”: “string”,
“canBeUsedToMatch”: true,
“removed”: false
},
{
“id”: “Spalte 4”,
“displayName”: “Spalte 4”,
“required”: false,
“defaultMatch”: false,
“display”: true,
“type”: “string”,
“canBeUsedToMatch”: true,
“removed”: false
},
{
“id”: “Spalte 5”,
“displayName”: “Spalte 5”,
“required”: false,
“defaultMatch”: false,
“display”: true,
“type”: “string”,
“canBeUsedToMatch”: true,
“removed”: true
},
{
“id”: “Spalte 6”,
“displayName”: “Spalte 6”,
“required”: false,
“defaultMatch”: false,
“display”: true,
“type”: “string”,
“canBeUsedToMatch”: true,
“removed”: true
}
],
“attemptToConvertTypes”: false,
“convertFieldsToString”: false
},
“options”: {}
},
“id”: “c6a93937-b830-4e01-914d-aad2e0cd0c78”,
“name”: “Save emails to Google Sheet”,
“type”: “n8n-nodes-base.googleSheets”,
“typeVersion”: 4.5,
“position”: [
680,
0
],
“credentials”: {
“googleSheetsOAuth2Api”: {
“id”: “8CqcSxQ71MZQGx4z”,
“name”: “Google Sheets account 2”
}
}
}
],
“pinData”: {},
“connections”: {
“Remove Duplicate URLs”: {
“main”: [
[
{
“node”: “Loop over URLs”,
“type”: “main”,
“index”: 0
}
]
]
},
“Search Google Maps with query”: {
“main”: [
[
{
“node”: “Scrape URLs from results”,
“type”: “main”,
“index”: 0
}
]
]
},
“Scrape URLs from results”: {
“main”: [
[
{
“node”: “Filter irrelevant URLs”,
“type”: “main”,
“index”: 0
}
]
]
},
“Filter irrelevant URLs”: {
“main”: [
[
{
“node”: “Remove Duplicate URLs”,
“type”: “main”,
“index”: 0
}
]
]
},
“Workflow Input Trigger”: {
“main”: [
[
{
“node”: “Search Google Maps with query”,
“type”: “main”,
“index”: 0
}
]
]
},
“Scrape HomePage”: {
“main”: [
[
{
“node”: “Loop over URLs”,
“type”: “main”,
“index”: 0
}
]
]
},
“Remove duplicate entries”: {
“main”: [
[
{
“node”: “Save emails to Google Sheet”,
“type”: “main”,
“index”: 0
}
]
]
},
“Loop over URLs”: {
“main”: [
[
{
“node”: “Clean Emails”,
“type”: “main”,
“index”: 0
}
],
[
{
“node”: “Request web page for URL”,
“type”: “main”,
“index”: 0
}
]
]
},
“Request web page for URL”: {
“main”: [
[
{
“node”: “Scrape HomePage”,
“type”: “main”,
“index”: 0
}
]
]
},
“Clean Emails”: {
“main”: [
[
{
“node”: “Remove duplicate entries”,
“type”: “main”,
“index”: 0
}
]
]
}
},
“active”: false,
“settings”: {
“executionOrder”: “v1”
},
“versionId”: “2afda3b7-cb24-4c42-b725-0f1bb12b15fa”,
“meta”: {
“templateCredsSetupCompleted”: true,
“instanceId”: “45568561f2a2daf5563323031b45fddb0d7855d0c695f3d268dc2ef7b4099a5d”
},
“id”: “16TaPFJXMX6bugKP”,
"tags": []
}