| import { JSDOM, VirtualConsole } from "jsdom"; |
|
|
/**
 * Recursively removes every `<script>` and `<style>` element found below
 * `node`, mutating the tree in place.
 *
 * @param node - Root of the subtree to clean. Only descendants are removed;
 *   the root itself is left attached wherever it is.
 */
function removeTags(node: Node): void {
  // Walk a snapshot of the children: `childNodes` is a live list, so removing
  // an entry while iterating it directly would skip the following sibling.
  for (const childNode of Array.from(node.childNodes)) {
    // Compare case-insensitively: HTML elements report an upper-case nodeName,
    // but elements in other namespaces (e.g. <style> inside <svg>) may not.
    const tagName = childNode.nodeName.toUpperCase();
    if (tagName === "SCRIPT" || tagName === "STYLE") {
      node.removeChild(childNode);
    } else {
      removeTags(childNode);
    }
  }
}
/**
 * Rough stand-in for `innerText`: collects the text of every text node below
 * `node` in document order and joins the pieces with `"\n"`.
 *
 * Text nodes contribute their own text, element nodes contribute the result of
 * recursing into them, and every other node type (comments, etc.) contributes
 * an empty string.
 *
 * @param node - Root of the subtree whose text should be extracted.
 * @returns The collected text, or `""` when `node` has no children.
 */
function naiveInnerText(node: Node): string {
  // The node-type constants are read from the instance because the global
  // `Node` constructor is not available in plain Node.js.
  return Array.from(node.childNodes, (childNode) => {
    switch (childNode.nodeType) {
      case node.TEXT_NODE:
        // Use the child's own text; the parent's textContent would repeat the
        // text of every descendant once per text-node child.
        return childNode.textContent ?? "";
      case node.ELEMENT_NODE:
        return naiveInnerText(childNode);
      default:
        return "";
    }
  }).join("\n");
}
|
|
/**
 * Downloads the page at `url` and returns the text of its `<body>` with
 * `<script>`/`<style>` content removed.
 *
 * The download is best effort: if the request fails or is aborted after
 * 10 seconds, the error is logged and an empty document is parsed instead, so
 * the function resolves to an empty string rather than rejecting.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted text with double spaces and all line breaks removed.
 * @throws Error when the parsed document has no `<body>` element.
 */
export async function parseWeb(url: string): Promise<string> {
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), 10000);

  let htmlString = "";
  try {
    const response = await fetch(url, { signal: abortController.signal });
    htmlString = await response.text();
  } catch (err) {
    // Deliberate best effort: report the failure and carry on with an empty page.
    console.log(err);
  } finally {
    // Without this the timer stays pending (and fires a pointless abort) for
    // up to 10 seconds after the request has already settled.
    clearTimeout(abortTimer);
  }

  // A listener that does nothing is attached to the "error" event.
  // NOTE(review): presumably this keeps jsdom's page-level errors from being
  // reported anywhere — confirm against the jsdom VirtualConsole docs.
  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {});

  const dom = new JSDOM(htmlString, { virtualConsole });

  const body = dom.window.document.querySelector("body");
  if (!body) throw new Error("body of the webpage is null");

  removeTags(body);

  // Strips every run of two spaces and every line break, including the "\n"
  // separators inserted by naiveInnerText.
  return naiveInnerText(body).replace(/ {2}|\r\n|\n|\r/gm, "");
}
|
|