使用 node.js 和 request 提取（外部网站的）所有超链接

【问题标题】：extract all hyperlinks ( from external website ) using node.js and request（使用 node.js 和 request 提取外部网站的所有超链接）
【发布时间】：2013-02-26 21:46:37
【问题描述】：

现在我们的应用程序会把 nodejs.org 的源代码输出到控制台。我们希望它改为输出 nodejs.org 上的所有超链接。也许只需要一行代码就可以从 body 中获取这些链接。

app.js：

const http = require('http');

/**
 * Answers every incoming request with a 200 plain-text greeting.
 */
function handleRequest(req, res) {
    res.writeHead(200, {'Content-Type': 'text/plain'});
    res.end('Hello World\n');
}

// Bind the server to the loopback interface on port 1337.
const server = http.createServer(handleRequest);
server.listen(1337, '127.0.0.1');
console.log('Server running at http://127.0.0.1:1337/');

const request = require("request");

// Download the nodejs.org home page and print its raw HTML,
// or print the error when the request fails.
request("http://nodejs.org/", function (error, response, body) {
    if (error) {
        console.log(error);
        return;
    }
    console.log(body);
});

【问题讨论】：

标签： javascript node.js request

【解决方案1】：

package.json

    {
      "name": "url_extractor",
      "version": "1.0.0",
      "description": "tool to extract all urls from website",
      "main": "index.js",
      "scripts": {
        "start": "node index.js",
        "test": "echo \"Error: no test specified\" && exit 1"
      },
      "author": "sandip shelke",
      "license": "ISC",
      "dependencies": {
        "axios": "^0.24.0",
        "cheerio": "^1.0.0-rc.10"
      }
    }

index.js

        const axios = require('axios');
        var cheerio = require('cheerio');

        // Placeholder: replace with the URL of the site whose links should be listed.
        var baseUrl = 'target website base url';

        // Entry point: collect the links found on the base URL's page and print
        // them; any failure along the way is printed instead.
        getLinksFromURL(baseUrl)
            .then((homePageLinks) => {
                console.log(homePageLinks);
            })
            .catch((e) => {
                console.log(e);
            });



        /**
         * Downloads the page at `url` and collects every `<a>` element on it.
         *
         * @param {string} url - Address of the page to fetch.
         * @returns {Promise<Array<{text: string, href: (string|undefined)}>>}
         *   One entry per anchor, in document order. `href` is the attribute
         *   value exactly as written in the markup (it is not resolved against
         *   `url`) and is `undefined` for anchors without an `href` attribute.
         * @throws Rejects with the underlying error when the download or the
         *   HTML parsing fails, so the caller decides how to report it.
         */
        async function getLinksFromURL(url) {
            // No local try/catch: logging here and falling through would make
            // the function resolve to `undefined`, hiding the failure from the
            // caller's own error handling.
            const httpResponse = await axios.get(url);
            const $ = cheerio.load(httpResponse.data);

            return $('a') // every anchor element on the page
                .toArray()
                .map((element) => {
                    const anchor = $(element);
                    return {
                        text: anchor.text(),
                        href: anchor.attr('href'),
                    };
                });
        }

此代码仅获取主页上的链接；若要获取整个网站的所有链接，需要对获取到的链接递归调用该函数。

假设你已经安装了 Node.js，先运行 npm install，再运行 npm start 即可执行上面的代码。

【讨论】：

【解决方案2】：

您可能正在寻找jsdom、jquery 或cheerio。您所做的称为屏幕抓取，即从站点中提取数据。 jsdom/jquery 提供了完整的工具集，但是cheerio 更快。

这是一个 cheerio 的例子：

const request = require('request');
const cheerio = require('cheerio');

// The search term is already query-encoded ('+' stands for a space).
const searchTerm = 'screen+scraping';
const url = 'http://www.bing.com/search?q=' + searchTerm;

// Download the search-results page and print the text and href of every anchor.
request(url, function (err, resp, body) {
  // On a failed request there is no HTML to parse, so report the error and stop.
  if (err) {
    console.error(err);
    return;
  }

  // Declared locally: assigning to undeclared `$`/`links` creates implicit
  // globals and throws a ReferenceError in strict mode.
  const $ = cheerio.load(body);
  $('a').each(function (i, link) {
    console.log($(link).text() + ':\n  ' + $(link).attr('href'));
  });
});

你选择最适合你的。

【讨论】：

有没有办法通过cheerio获取document.links结果？
你不能直接调用links.each并在函数内部使用$(this).text()和$(this).attr('href')吗？至少在cheerio 这有效。
如果要获取绝对url，请使用url.resolve(crawl_url, $(link).attr('href'))