Hi guys,
I am trying to avoid using a dedicated web-scraping tool (like Webscraper.io), for both financial and technical reasons (I would have to create the scraper via an API, download the results file, parse it, etc.).
I have created a workflow with HTTP Request and HTML Extract nodes to get the data.
However, the website that I am scraping is blocking me and asking for a CAPTCHA.
Do you know if it’s possible to use a proxy URL in the HTTP Request node, and whether there is a way to solve the CAPTCHA?
Regards
{
"nodes": [
{
"parameters": {
"dataPropertyName": "data_brand",
"extractionValues": {
"values": [
{
"key": "watch_url",
"cssSelector": ".article-item-container",
"returnValue": "html",
"returnArray": true
}
]
},
"options": {}
},
"name": "HTML Extract",
"type": "n8n-nodes-base.htmlExtract",
"typeVersion": 1,
"position": [
730,
300
]
},
{
"parameters": {
"dataPropertyName": "watch_url",
"extractionValues": {
"values": [
{
"key": "link",
"cssSelector": "a",
"returnValue": "attribute",
"attribute": "href"
}
]
},
"options": {}
},
"name": "HTML Extract1",
"type": "n8n-nodes-base.htmlExtract",
"typeVersion": 1,
"position": [
900,
300
]
},
{
"parameters": {
"url": "https://www.chrono24.fr/rolex/index.htm",
"responseFormat": "string",
"dataPropertyName": "data_brand",
"options": {}
},
"name": "HTTP Request1",
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 1,
"position": [
550,
300
]
}
],
"connections": {
"HTML Extract": {
"main": [
[
{
"node": "HTML Extract1",
"type": "main",
"index": 0
}
]
]
},
"HTTP Request1": {
"main": [
[
{
"node": "HTML Extract",
"type": "main",
"index": 0
}
]
]
}
}
}