Extract first column of an HTML Table

Hi

I want to extract the first column of an HTML table from
https://noc.rub.de/cgi-bin/showextblocked

(Note: The Network Operation Center of the Ruhr-Universität Bochum publishes IP addresses from which they are attacked and which they have blocked)

in a later version of the Workflow the wf shall initiate a failban command …

I get back

{
	"item": [
		"<th>IP-Adresse / Subnetz</th>\n<th>Grund</th>\n<th>Port</th>\n<th>Anzahl</th>\n<th>Zeitstempel</th>",
		
		"<td>240e:f7:4f01:c::3</td>
		<td>RDP attempts</td>
		<td align=\"right\"></td>
		<td align=\"right\">2683</td>
		<td>2021-08-15 14:28:03</td>",
		
		"<td>&nbsp;</td>
		<td>SSH scans</td>
		<td align=\"right\"></td>
		<td align=\"right\">2798</td>
		<td>2021-08-15 14:28:05</td>",
		
		"<td>&nbsp;</td>
		<td>Web scans</td>
		<td align=\"right\"></td>
		<td align=\"right\">2624</td>
		<td>2021-08-15 14:28:07</td>",
		
		"<td>&nbsp;</td>
		<td>TCP scans</td>
		<td align=\"right\"></td>
		<td align=\"right\">15751</td>
		<td>2021-08-15 14:29:31</td>",
		
		"<td>&nbsp;</td>
		<td>TCP scans (by ports)</td>
		<td align=\"right\">22</td>
		<td align=\"right\">2798</td>
		<td>2021-08-15 14:31:01</td>",
		
		"<td>2620:96:a000::5</td>
		<td>SSH scans</td>
		<td align=\"right\">
		</td><td align=\"right\">46</td>
		<td>2021-08-16 07:44:09</td>",
		
		"<td>2001:4ca0:108:42:0:80:6:9</td>
		<td>Web scans</td>
		<td align=\"right\"></td>
		<td align=\"right\">149</td>
		<td>2021-08-16 10:14:08</td>"
...

Could someone help me please with the following questions

  1. How can I get rid of the first line with <th> I look for <tr>. string replace? RegExp?
  2. for MVP - how can I get only the first column if it does NOT contain “&nbsp;”? RegExp?
  3. after MVP - how can I get a nested json like?
{
	"item": [
		"IP": "240e:f7:4f01:c::3"
		"Port": "22",
		"REASONs": [
			"REASON": [
				"description": "RDP attempts",
				"Trials": "2614",
				"Timestamp": "2021-08-15 14:28:03", 
			],
			"REASON": [
				"description": "SSH scans",
				"Port": "22",
				"Trials": "2711",
				"Timestamp": "2021-08-15 14:28:03", 
			],
			...
			],
		],[		[
		"IP": "2620:96:a000::5"
		"Port": "",
		"REASONs": [
			"REASON": [
				"description": "SSH scans",
				"Port": "22",
				"Trials": "2711",
				"Timestamp": "2021-08-15 14:28:03", 
			],
			...

here is my workflow

{
  "name": "ENV_Failban_IP_RUB",
  "nodes": [
    {
      "parameters": {},
      "name": "Start",
      "type": "n8n-nodes-base.start",
      "typeVersion": 1,
      "position": [
        -230,
        200
      ]
    },
    {
      "parameters": {
        "path": "FailBanRUB",
        "responseMode": "lastNode",
        "options": {}
      },
      "name": "Webhook",
      "type": "n8n-nodes-base.webhook",
      "typeVersion": 1,
      "position": [
        -30,
        200
      ],
      "webhookId": "f497a8f3-6bfd-40d3-a900-d159ca935da1"
    },
    {
      "parameters": {
        "url": "https://noc.rub.de/cgi-bin/showextblocked",
        "responseFormat": "string",
        "options": {}
      },
      "name": "GET Blocked",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 1,
      "position": [
        200,
        200
      ]
    },
    {
      "parameters": {
        "extractionValues": {
          "values": [
            {
              "key": "item",
              "cssSelector": "tr",
              "returnValue": "html",
              "returnArray": true
            }
          ]
        },
        "options": {}
      },
      "name": "Table HTML Extract",
      "type": "n8n-nodes-base.htmlExtract",
      "typeVersion": 1,
      "position": [
        440,
        200
      ]
    }
  ],
  "connections": {
    "Webhook": {
      "main": [
        [
          {
            "node": "GET Blocked",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "GET Blocked": {
      "main": [
        [
          {
            "node": "Table HTML Extract",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "active": false,
  "settings": {},
  "id": 13
}

This is how the website lookas at the moment

Hey @ico,

I don’t see any tr tags in the output that you have shared. What result do you get? I would suggest you instead use the td tag. You can use something like td:nth-child(4) in the CSS Selector field to get the values for the particular column.

Using the CSS selector mentioned above you will be able to extract the information with the Key you want. Once you have the value, you can format it using either the Set node or the Function node.

1 Like

td:nth-child(4) should do it - i will give it a try and report back - thanks rob

You might want to change the value based on which element you want to refer to. Here 4 will return the 4th item.

i know :slight_smile:

working ! thanks
Now i need to clean `the result . How can I do that? Function node?

  1. delete “&nbsp;”,
  2. delete <b> and </b>
{
	"IP": [
		"240e:f7:4f01:c::3",
		"&nbsp;",
		"&nbsp;",
		"&nbsp;",
		"&nbsp;",
		"2620:96:a000::5",
		"&nbsp;",
		"&nbsp;",
			"222.77.182.0<b>/24</b>",
		"222.186.62.0<b>/24</b>",
		"222.216.29.52<b>/31</b>",
		"222.222.243.121",
		"223.4.19.71",
		"223.4.216.108",
		"223.19.51.247",
		"223.204.165.95"
	]
}
``

The function node below should do it.

const ips = items[0].json.IP

const _ips = []

for (let ip of ips) {
  if (ip !== '&nbsp;') {
    ip = ip.replace('<b>', '').replace('</b>', '');
    _ips.push(ip)
  }
}

return [
  {
    json: {
      ips: _ips,
    }
  }
]
Example workflow
{
  "nodes": [
    {
      "parameters": {},
      "name": "Start",
      "type": "n8n-nodes-base.start",
      "typeVersion": 1,
      "position": [
        250,
        300
      ]
    },
    {
      "parameters": {
        "functionCode": "return [\n    {\n\n    json: {\n      \t\"IP\": [\n\t\t\"240e:f7:4f01:c::3\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\"2620:96:a000::5\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\t\"222.77.182.0<b>/24</b>\",\n\t\t\"222.186.62.0<b>/24</b>\",\n\t\t\"222.216.29.52<b>/31</b>\",\n\t\t\"222.222.243.121\",\n\t\t\"223.4.19.71\",\n\t\t\"223.4.216.108\",\n\t\t\"223.19.51.247\",\n\t\t\"223.204.165.95\"\n\t  ]\n    }\n  }\n]"
      },
      "name": "Function",
      "type": "n8n-nodes-base.function",
      "typeVersion": 1,
      "position": [
        640,
        310
      ]
    },
    {
      "parameters": {
        "functionCode": "const ips = items[0].json.IP\n\nconst _ips = []\n\nfor (let ip of ips) {\n  if (ip !== '&nbsp;') {\n    ip = ip.replace('<b>', '').replace('</b>', '');\n    _ips.push(ip)\n  }\n}\n\nreturn [\n  {\n    json: {\n      ips: _ips,\n    }\n  }\n]"
      },
      "name": "Function1",
      "type": "n8n-nodes-base.function",
      "typeVersion": 1,
      "position": [
        880,
        310
      ]
    }
  ],
  "connections": {
    "Start": {
      "main": [
        [
          {
            "node": "Function",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Function": {
      "main": [
        [
          {
            "node": "Function1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}

hi Rciardo
thanks - works - almost looks like i have an new line or something else
modified the replace to

ip = ip.replace('<b>', '').replace('</b>', '').replace('\n', '').replace('\r', '');

but it still looks like

any more hints?
rob

Forget it - i think the raw data has no /n or /r - i it just the browser … in postman it looks ok . but to be sure i added

ip = ip.replace(/[^\x20-\x7E]/g, '');

source: JavaScript: Remove non-printable ASCII chars - w3resource

1 Like

Output should look like - for step 3 …

[{“ip”:“240e:f7:4f01:c::3”},{“ip”:“2001:4ca0:108:42:0:80:6:9”},{“ip”:“2620:96:a000::5”},{“ip”:“1.0.248.198”},{“ip”:“1.14.132.28”}, … ]

It looks like

  {"_ips":["240e:f7:4f01:c::3","2001:4ca0:108:42:0:80:6:9","2620:96:a000::5","1.0.248.198","1.14.132.28", …]

could someone give me another tip. I know it is JavaScript (nit n8n) , but JS is new to me. I have already started attempts with infos I found here https://mossgreen.github.io/JSON-manipulation-with-javascript/:
But I was not successful. I would appreciate further help - rob

To do that check the function node below.

const ips = items[0].json.IP

const _ips = []

for (let ip of ips) {
  if (ip !== '&nbsp;') {
    ip = ip.replace('<b>', '').replace('</b>', '');
    _ips.push({
      json: {
        ip
      }
    })
  }
}

return _ips;
Example workflow
{
  "nodes": [
    {
      "parameters": {},
      "name": "Start",
      "type": "n8n-nodes-base.start",
      "typeVersion": 1,
      "position": [
        250,
        300
      ]
    },
    {
      "parameters": {
        "functionCode": "return [\n    {\n\n    json: {\n      \t\"IP\": [\n\t\t\"240e:f7:4f01:c::3\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\"2620:96:a000::5\",\n\t\t\"&nbsp;\",\n\t\t\"&nbsp;\",\n\t\t\t\"222.77.182.0<b>/24</b>\",\n\t\t\"222.186.62.0<b>/24</b>\",\n\t\t\"222.216.29.52<b>/31</b>\",\n\t\t\"222.222.243.121\",\n\t\t\"223.4.19.71\",\n\t\t\"223.4.216.108\",\n\t\t\"223.19.51.247\",\n\t\t\"223.204.165.95\"\n\t  ]\n    }\n  }\n]"
      },
      "name": "Function",
      "type": "n8n-nodes-base.function",
      "typeVersion": 1,
      "position": [
        520,
        310
      ]
    },
    {
      "parameters": {
        "functionCode": "const ips = items[0].json.IP\n\nconst _ips = []\n\nfor (let ip of ips) {\n  if (ip !== '&nbsp;') {\n    ip = ip.replace('<b>', '').replace('</b>', '');\n    _ips.push({\n      json: {\n        ip\n      }\n    })\n  }\n}\n\nreturn _ips;"
      },
      "name": "Function1",
      "type": "n8n-nodes-base.function",
      "typeVersion": 1,
      "position": [
        760,
        310
      ]
    }
  ],
  "connections": {
    "Start": {
      "main": [
        [
          {
            "node": "Function",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Function": {
      "main": [
        [
          {
            "node": "Function1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}

This has just saved me so much time, I’ve been struggling for a while to get this one line of information I needed.

Any chance you can flesh out the documentation for the HMTL extract node this is sort of information on what can and can’t be grabbed via the CSS selection part?

1 Like

Like the idea! Adding @gabriel, who looks after our docs

3 Likes