【发布时间】:2021-01-21 10:27:27
【问题描述】:
我正在尝试抓取一个无限滚动的网站。
我正在控制滚动,但仍然在到达网页末尾后退出。
这是我的代码:
const puppeteer = require("puppeteer");
module.exports.scraper = async (url, callBack) => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setUserAgent(
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
);
await page.setViewport({ width: 1200, height: 768 });
function wait(ms) {
return new Promise((resolve) => setTimeout(() => resolve(), ms));
}
await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
waitUntil: "networkidle0",
});
// Get the height of the rendered page
const bodyHandle = await page.$("body");
const { height } = await bodyHandle.boundingBox();
await bodyHandle.dispose();
// Scroll one viewport at a time, pausing to let content load
const viewportHeight = page.viewport().height;
let viewportIncr = 0;
while (viewportIncr + viewportHeight < height) {
await page.evaluate((_viewportHeight) => {
window.scrollBy(0, _viewportHeight);
}, viewportHeight);
await wait(1600);
viewportIncr = viewportIncr + viewportHeight;
}
let data = await page.evaluate(() => {
window.scrollTo(0, 0);
let products = [];
let productElements = document.querySelectorAll(".product-wrap");
productElements.forEach((productElement) => {
let productJson = {};
try {
productJson.imageUrl = productElement.querySelector(".renderedImg").src;
productJson.brandName = productElement.querySelector(
".brand-name",
).innerText;
} catch (e) {
console.log(e);
}
products.push(productJson);
});
return products;
});
await wait(100);
callBack(data, true);
await browser.close();
};
这种情况怎么刮?
【问题讨论】:
-
“到达网页末尾后退出”是什么意思?它应该做什么呢?我还不清楚您要解决什么问题。
-
我需要我的脚本在滚动到达页面底部后等待它加载内容。我正在抓取具有无限滚动功能的网站。
标签: node.js web-scraping puppeteer