Hi there,
I am trying to build an email scraping tool. I managed to set up a workflow that scrapes the emails, but I cannot get anything else. I also need to capture the website URL (which my code already extracts), the phone number, the company name, the address and, if possible, the name of the business owner. This is all very new to me, but I would like to do it with HTTP requests only, without any external API services: the workflow should simply take the input URLs for the Google Maps area and, in the end, produce a Google spreadsheet with all of the variables mentioned above.
Something I also noticed is that, after some time, I sometimes get an HTTP block that says: "Problem in node ‘HTTP Request’
The service is receiving too many requests from you"
Any idea how this can also be solved/avoided/fixed?
This is the code I have been working with so far:
name "B2B Email Scraper"
nodes
0
parameters {}
type "n8n-nodes-base.manualTrigger"
typeVersion 1
position
0 -2336
1 256
id "e8f3fb60-6cc8-4389-ac72-75f8b15831ba"
name "When clicking âExecute workflowâ"
1
parameters
url "={{ $json.urllist }}"
options {}
type "n8n-nodes-base.httpRequest"
typeVersion 4.2
position
0 -1664
1 272
id "dae4272b-4b74-4cac-b565-fcc50153c998"
name "HTTP Request"
2
parameters
jsCode 'const input = $input.first().json.data\nconst regex = /https?:\\/\\/([^\\/\\s"]+)/g\nconst websites = input.match(regex);\n\nreturn websites.map(website => ({json:{website}}))\n\n'
type "n8n-nodes-base.code"
typeVersion 2
position
0 -1456
1 272
id "9d658d95-8e20-47bb-901f-792d0c816d10"
name "Code"
3
parameters
conditions
options
caseSensitive true
leftValue ""
typeValidation "strict"
version 2
conditions
0
id "93edae54-b050-4b4a-b548-96606853d86f"
leftValue "={{ $json.website }}"
rightValue "schema"
operator
type "string"
operation "notContains"
1
id "8e38ec79-9448-471d-961b-6aa19bea0328"
leftValue "={{ $json.website }}"
rightValue "google"
operator
type "string"
operation "notContains"
2
id "4c0241c6-5907-4a23-9c6e-1000c63153db"
leftValue "={{ $json.website }}"
rightValue "ggpht"
operator
type "string"
operation "notContains"
3
id "5fe3863f-85d3-4e81-9183-11cdaed9a3e6"
leftValue "={{ $json.website }}"
rightValue "googleapis"
operator
type "string"
operation "notContains"
4
id "c8a16e7e-4169-43c9-9a40-b68a6a4e7834"
leftValue "={{ $json.website }}"
rightValue "gstatic"
operator
type "string"
operation "notContains"
5
id "635b8a06-b8d6-4b62-9f1d-1908b193ee69"
leftValue ""
rightValue ""
operator
type "string"
operation "equals"
name "filter.operator.equals"
combinator "and"
options {}
type "n8n-nodes-base.filter"
typeVersion 2.2
position
0 -1248
1 112
id "32a5c4ef-447c-4ad2-9ed7-9ab85a937716"
name "Filter"
4
parameters
options {}
type "n8n-nodes-base.splitInBatches"
typeVersion 3
position
0 -1040
1 112
id "8704f149-a4fc-484e-a289-832b72456b2c"
name "Loop Over Items"
5
parameters
url "={{ $json.website }}"
options {}
type "n8n-nodes-base.httpRequest"
typeVersion 4.2
position
0 -848
1 208
id "1e0670e9-9c6f-4fdc-863c-336d6e86022f"
name "HTTP Request1"
onError "continueRegularOutput"
6
parameters
amount 1
type "n8n-nodes-base.wait"
typeVersion 1.1
position
0 -528
1 208
id "a2019126-8ce3-4154-bcb4-51ef8a83184f"
name "Wait"
webhookId "27e547b8-8687-4bd1-9582-31c9995eb99c"
7
parameters
jsCode "const input = $input.first().json.data\nconst emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.(?!png|jpg|gif|jpeg)[a-zA-Z]{2,}/g\nconst emails = input.match(emailRegex)\n\nreturn {json: {emails: emails}}"
type "n8n-nodes-base.code"
typeVersion 2
position
0 -688
1 208
id "d389b634-be67-4967-8a5b-1e5008a002b0"
name "Code1"
onError "continueRegularOutput"
8
parameters
amount 1
type "n8n-nodes-base.wait"
typeVersion 1.1
position
0 -864
1 16
id "353f94c7-d7ea-43a8-849f-43d1e849791d"
name "Wait1"
webhookId "bd74a283-73ae-4a80-bf58-ca9186bde540"
9
parameters
fieldToSplitOut "emails"
options {}
type "n8n-nodes-base.splitOut"
typeVersion 1
position
0 -688
1 16
id "98feb439-104e-4a13-962c-f0d47009f93d"
name "Split Out"
10
parameters
conditions
options
caseSensitive true
leftValue ""
typeValidation "strict"
version 2
conditions
0
id "ac7a1d61-56ff-48cd-9903-7070ec5c9a41"
leftValue "={{ $json.emails }}"
rightValue ""
operator
type "string"
operation "exists"
singleValue true
1
id "b71d4d94-bfac-4e26-9085-d191323b1751"
leftValue "={{ $json.emails }}"
rightValue "sentry"
operator
type "string"
operation "notContains"
2
id "41e87b2f-856f-43db-b117-edcdaaac8775"
leftValue "={{ $json.emails }}"
rightValue "sentry.io"
operator
type "string"
operation "notContains"
3
id "1157d58b-fd42-46d9-bf73-116bbbcbeae3"
leftValue "={{ $json.emails }}"
rightValue "sentry-next.wixpress.com"
operator
type "string"
operation "notContains"
4
id "e164e8f1-78b2-4d25-b65c-44e19c3e4dda"
leftValue "={{ $json.emails }}"
rightValue "localsearch.ch"
operator
type "string"
operation "notContains"
5
id "ed49ef40-81bd-4ecf-b91f-248d2ec2dfed"
leftValue "={{ $json.emails }}"
rightValue ".webp"
operator
type "string"
operation "notContains"
combinator "and"
options {}
type "n8n-nodes-base.filter"
typeVersion 2.2
position
0 -512
1 16
id "c50aca99-60e9-4320-97eb-fa053c89fbda"
name "Filter1"
11
parameters
options {}
type "n8n-nodes-base.removeDuplicates"
typeVersion 2
position
0 -336
1 16
id "ff007b70-117a-4e97-bd1e-df3f15b6441d"
name "Remove Duplicates"
12
parameters
operation "appendOrUpdate"
documentId
__rl true
value "1MXb6EZnMvV8MXR-StVaa5rEBrijAcDOVGGi-OQhrz7U"
mode "list"
cachedResultName "B2B Email Scraper"
cachedResultUrl "https://docs.google.com/spreadsheets/d/1MXb6EZnMvV8MXR-StVaa5rEBrijAcDOVGGi-OQhrz7U/edit?usp=drivesdk"
sheetName
__rl true
value "gid=0"
mode "list"
cachedResultName "Tabellenblatt1"
cachedResultUrl "https://docs.google.com/spreadsheets/d/1MXb6EZnMvV8MXR-StVaa5rEBrijAcDOVGGi-OQhrz7U/edit#gid=0"
columns
mappingMode "defineBelow"
value
Email "={{ $json.emails }}"
matchingColumns
0 "Email"
schema
0
id "Email"
displayName "Email"
required false
defaultMatch false
display true
type "string"
canBeUsedToMatch true
removed false
attemptToConvertTypes false
convertFieldsToString false
options {}
type "n8n-nodes-base.googleSheets"
typeVersion 4.6
position
0 -128
1 16
id "4b3b3bf9-492a-4a79-a61d-0130ce7734ba"
name "Append or update row in sheet"
credentials
googleSheetsOAuth2Api
id "LviblX1Q6FpimnWL"
name "Google Sheets account"
13
parameters
jsCode 'const urlList = [\n "https://www.google.com/maps/search/kosmetik+studio+basel/",\n "https://www.google.com/maps/search/beauty+basel/",\n "https://www.google.com/maps/search/spa+basel/",\n "https://www.google.com/maps/search/kosmetik+basel/"\n];\n\nreturn urlList.map(url => {\n return {\n json: {\n urllist: url\n }\n };\n});\n'
type "n8n-nodes-base.code"
typeVersion 2
position
0 -2128
1 256
id "21c23c8c-51c4-46b6-9d8f-1a0ea94b4dd3"
name "Code2"
14
parameters
options {}
type "n8n-nodes-base.splitInBatches"
typeVersion 3
position
0 -1920
1 256
id "b8e11500-e19d-4bae-a53c-9fa3d35df358"
name "Loop Over Items1"
15
parameters
amount 1
type "n8n-nodes-base.wait"
typeVersion 1.1
position
0 -1248
1 272
id "c9865969-1f03-4ac3-b91b-6eab73735677"
name "Wait2"
webhookId "e59e16a5-4b06-4d77-8b91-69bdb060de64"
16
parameters
amount 1
type "n8n-nodes-base.wait"
typeVersion 1.1
position
0 -1712
1 112
id "a01dac6d-1ea8-45cc-9940-36e832230fc8"
name "Wait3"
webhookId "8e9af8e5-a78f-43a1-9059-781bfd5f5c57"
pinData {}
connections
When clicking âExecute workflowâ
main
0
0
node "Code2"
type "main"
index 0
HTTP Request
main
0
0
node "Code"
type "main"
index 0
Code
main
0
0
node "Wait2"
type "main"
index 0
Filter
main
0
0
node "Loop Over Items"
type "main"
index 0
Loop Over Items
main
0
0
node "Wait1"
type "main"
index 0
1
0
node "HTTP Request1"
type "main"
index 0
HTTP Request1
main
0
0
node "Code1"
type "main"
index 0
Code1
main
0
0
node "Wait"
type "main"
index 0
Wait
main
0
0
node "Loop Over Items"
type "main"
index 0
Wait1
main
0
0
node "Split Out"
type "main"
index 0
Split Out
main
0
0
node "Filter1"
type "main"
index 0
Filter1
main
0
0
node "Remove Duplicates"
type "main"
index 0
Remove Duplicates
main
0
0
node "Append or update row in sheet"
type "main"
index 0
Code2
main
0
0
node "Loop Over Items1"
type "main"
index 0
Loop Over Items1
main
0
0
node "Wait3"
type "main"
index 0
1
0
node "HTTP Request"
type "main"
index 0
Wait2
main
0
0
node "Loop Over Items1"
type "main"
index 0
Wait3
main
0
0
node "Filter"
type "main"
index 0
active false
settings
executionOrder "v1"
versionId "7dec2785-68f2-4782-b9b9-b9de17a71c14"
meta
templateCredsSetupCompleted true
instanceId "fea9473386d674cec63cc4af91dbc62f6f1798952d679134d0822b9d35550d02"
id "iHGm41hxIiQqhKLE"
tags []
Additionally, I tried using other functions to scrape multiple things all at once, but either the Split Out step or the function itself doesn't work:
const html = $input.first().json;
const source = html.data || '';
const result = {
website: $json.website,
companyName: null,
email: null,
phone: null,
address: null,
};
// Company Name (from <title>)
const titleMatch = source.match(/<title>(.*?)<\/title>/i);
if (titleMatch) {
result.companyName = titleMatch[1].trim();
}
// Email
const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.(?!jpeg|jpg|png|gif|webp|svg)[a-zA-Z]{2,}/g;
const emails = source.match(emailRegex) || [];
if (emails.length) result.email = [...new Set(emails)][0];
// Phone
const phoneRegex = /(\+?\d[\d\s\-()\/]{6,}\d)/g;
const phones = source.match(phoneRegex) || [];
if (phones.length) result.phone = [...new Set(phones)][0];
// Address (basic pattern match)
const addressRegex = /\d{4,5}\s?[A-Z][a-zA-Z]+(?:strasse|gasse|weg|allee|platz)?[\s,\d\w]*/gi;
const addresses = source.match(addressRegex) || [];
if (addresses.length) result.address = addresses[0];
return { json: result };
Thanks for the help in advance!!!