【问题标题】:Puppeteer looping through array of URLS to visit next page not working(Puppeteer 循环遍历 URL 数组以访问下一页不起作用)
【发布时间】:2020-09-19 15:00:43
【问题描述】:

我试图让 Puppeteer 访问数组中的多个 url,但它似乎不起作用。它永远挂起。

这是似乎不起作用的代码片段。请注意,URL 使用了模板字符串。

  // Excerpt: builds one list-view URL per results page, then visits each URL
  // in turn on the existing `page`. `pageNumberToNumber` and `page` are defined
  // outside this excerpt.
  //create array of urls
  let urlList = [];

  //add urls to array
  // Only the CurrentPage value inside the URL fragment varies between entries.
  // i starts at 0, so the generated values are CurrentPage=0 .. pageNumberToNumber - 1.

  for (let i = 0; i < pageNumberToNumber; i++) {
    urlList.push(
      `https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
    );
  }

  // Print the generated list and its length for inspection.
  console.log(urlList);
  console.log(urlList.length);

  for (let i = 0; i < urlList.length; i++) {
    const url = urlList[i];
    await page.goto(`${url}`);
    // NOTE(review): waitForNavigation() is only started after goto() has been
    // awaited, so it waits for a *further* navigation that nothing here
    // triggers — presumably the cause of the reported hang; confirm against
    // the Puppeteer documentation.
    await page.waitForNavigation({ waitUntil: "networkidle2" });
  }

这是完整的代码

const puppeteer = require("puppeteer-extra");
// Enable stealth plugin with all evasions
puppeteer.use(require("puppeteer-extra-plugin-stealth")());

// Script entry point: launches a visible (non-headless) browser, opens page 1
// of the listing search, scrapes every `.cardCon` element, reads the total
// page count from the pagination widget, then attempts to visit one URL per
// results page. Nothing in this block closes the browser.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
  });
  const page = await browser.newPage();

  // First results page (CurrentPage=1 in the URL fragment).
  let url =
    "https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=1&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD";

  await page.goto(url);
  // Fixed pause before scraping — presumably 10000 ms to let the listings
  // render; confirm the unit against the Puppeteer version in use.
  await page.waitFor(10000);

  //page.evaluate lets you use JavaScript as it was being used in the developer console on the page
  // The callback runs in the page context and returns an array of
  // { price, details, address, url } objects, one per `.cardCon` element.
  const result = await page.evaluate(() => {
    let data = []; // Create an empty array that will store our data
    let elements = document.querySelectorAll(".cardCon"); // Select all Products

    for (var element of elements) {
      // Loop through each product
      // The fields below are located purely by position in the card's DOM
      // tree, so they depend on the exact markup of the page.
      let price =
        element.children[0].children[1].childNodes[1].children[1].childNodes[1]
          .childNodes[3].innerText; // Select the price
      let address =
        element.children[0].children[1].childNodes[1].children[1].childNodes[1]
          .childNodes[5].innerText; // Select the address
      let details = element.children[0].children[1].childNodes[1].children[1].childNodes[3].innerText.replace(
        /\n/g,
        " "
      ); // Select the details, with newlines replaced by spaces
      let url = element.children[0].childNodes[3].href; // Select the link target (href)

      data.push({ price, details, address, url }); // Push an object with the data onto our array
    }

    return data; // Return our data array
  });

  console.log(result);

  //get total page number
  // Read as text from the bottom pagination widget.
  let totalPageNumber = await page.$eval(
    "#ListViewPagination_Bottom > div > div > div > span.paginationTotalPagesNum",
    (element) => element.innerText
  );

  console.log(totalPageNumber);

  //convert total page number from string to number
  // No radix argument is passed to parseInt here.
  let pageNumberToNumber = parseInt(totalPageNumber);

  //create array of urls
  let urlList = [];

  //add urls to array
  // Only the CurrentPage value inside the URL fragment varies between entries.
  // i starts at 0, so the generated values are CurrentPage=0 .. pageNumberToNumber - 1.

  for (let i = 0; i < pageNumberToNumber; i++) {
    urlList.push(
      `https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
    );
  }

  // Print the generated list and its length for inspection.
  console.log(urlList);
  console.log(urlList.length);

  for (let i = 0; i < urlList.length; i++) {
    const url = urlList[i];
    await page.goto(`${url}`);
    // NOTE(review): waitForNavigation() is only started after goto() has been
    // awaited, so it waits for a *further* navigation that nothing here
    // triggers — presumably the cause of the reported hang; confirm against
    // the Puppeteer documentation.
    await page.waitForNavigation({ waitUntil: "networkidle2" });
  }
})();

【问题讨论】:

    标签: javascript arrays loops web-scraping puppeteer


    【解决方案1】:

    当您的 goto 完成时,导航已经结束,因此随后的 await page.waitForNavigation({ waitUntil: "networkidle2" }); 永远检测不到新的导航事件。

    在您的情况下,您可以在您的 goto 调用中添加一个 options argument 来决定您的 waitUntil 策略。

    替换这个

    // goto() is awaited first; waitForNavigation() is only started afterwards.
    await page.goto(`${url}`);
    await page.waitForNavigation({ waitUntil: "networkidle2" });
    

    替换为这个

    // The waitUntil option is passed to goto() itself, so no separate waitForNavigation() call is made.
    await page.goto(`${url}`, { waitUntil: "networkidle2" });
    

    简单示例:请注意,我为每个 URL 创建了一个新页面,以避免您在这个特定网站上使用 goto 时遇到的问题。

    const puppeteer = require("puppeteer-extra");
    // Enable stealth plugin with all evasions
    puppeteer.use(require("puppeteer-extra-plugin-stealth")());
    
    /**
     * Demo entry point: opens each paginated listing URL in its own tab.
     *
     * A fresh tab is created per URL and closed again afterwards. The tab and
     * the browser are released in `finally` blocks so that a failed navigation
     * does not leave a visible browser window running.
     */
    (async function () {
      const browser = await puppeteer.launch({ headless: false });

      try {
        const pageNumberToNumber = 3; // Hard-coded page count for this demo

        // One URL per results page; CurrentPage runs from 1 to pageNumberToNumber.
        const urlList = Array.from(
          { length: pageNumberToNumber },
          (_, index) =>
            `https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${index + 1}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
        );

        console.log(urlList);
        console.log(urlList.length);

        for (const url of urlList) {
          // Creating new page to load each url.
          const tempPage = await browser.newPage();
          try {
            // The wait strategy is given to goto() directly; a separate
            // waitForNavigation() after an awaited goto() would have no
            // navigation left to observe.
            await tempPage.goto(url, { waitUntil: "networkidle2" });
            // 5s wait to manually confirm page changed. A plain timer is used
            // instead of page.waitFor(), which is deprecated.
            await new Promise((resolve) => setTimeout(resolve, 5000));
          } finally {
            await tempPage.close();
          }
        }
      } finally {
        await browser.close();
      }
    })().catch((error) => {
      // Surface failures instead of leaving an unhandled promise rejection.
      console.error(error);
      process.exitCode = 1;
    });
    

    或者,您可以将两个调用(并首先调用 waitForNavigation)包装在 Promise.all 中。

    【讨论】:

    • 谢谢,浏览器现在只是无限期地停留在第一页,永远不会导航到下一页。
    • 您提供的网址是从哪里得到的?我的代码中没有看到它。
    • 抱歉,我从模板字符串中粘贴了错误的 URL……我已更新示例以使用您的 URL。请告诉我结果如何,我现在对这个问题很上心 :)
    • 很遗憾,朋友,还是不行。
    • 根据您最初发布的完整代码示例,它在控制台输出了 3 个 URL。第一个对应第 0 页,没有结果。您可能有一个差一错误(off-by-one error),需要修正计数器。不过,即使修正了这一点,您的完整代码示例实际上也没有像您之前提到的那样执行 goto 跳转。我不确定为什么您示例中的 goto 似乎不起作用……可以考虑改为点击页面并收集数据吗?否则我今晚晚些时候再看看 :)
    猜你喜欢
    • 1970-01-01
    • 2020-02-12
    • 2019-02-18
    • 1970-01-01
    • 2020-12-18
    • 1970-01-01
    • 2016-02-04
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多