【发布时间】:2020-09-19 15:00:43
【问题描述】:
我试图让 Puppeteer 访问数组中的多个 url,但它似乎不起作用。它永远挂起。
这是似乎不起作用的 sn-p。请注意使用模板文字的 URL。
//create array of urls
let urlList = [];
//add urls to array
for (let i = 0; i < pageNumberToNumber; i++) {
urlList.push(
`https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
);
}
console.log(urlList);
console.log(urlList.length);
for (let i = 0; i < urlList.length; i++) {
const url = urlList[i];
await page.goto(`${url}`);
await page.waitForNavigation({ waitUntil: "networkidle2" });
}
这是完整的代码
const puppeteer = require("puppeteer-extra");
// Enable stealth plugin with all evasions
puppeteer.use(require("puppeteer-extra-plugin-stealth")());
(async () => {
const browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
let url =
"https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=1&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD";
await page.goto(url);
await page.waitFor(10000);
//page.evaluate lets you use JavaScript as it was being used in the developer console on the page
const result = await page.evaluate(() => {
let data = []; // Create an empty array that will store our data
let elements = document.querySelectorAll(".cardCon"); // Select all Products
for (var element of elements) {
// Loop through each product
let price =
element.children[0].children[1].childNodes[1].children[1].childNodes[1]
.childNodes[3].innerText; // Select the price
let address =
element.children[0].children[1].childNodes[1].children[1].childNodes[1]
.childNodes[5].innerText; // Select the address
let details = element.children[0].children[1].childNodes[1].children[1].childNodes[3].innerText.replace(
/\n/g,
" "
); // Select the details
let url = element.children[0].childNodes[3].href; // Select the address
data.push({ price, details, address, url }); // Push an object with the data onto our array
}
return data; // Return our data array
});
console.log(result);
//get total page number
let totalPageNumber = await page.$eval(
"#ListViewPagination_Bottom > div > div > div > span.paginationTotalPagesNum",
(element) => element.innerText
);
console.log(totalPageNumber);
//convert total page number from string to number
let pageNumberToNumber = parseInt(totalPageNumber);
//create array of urls
let urlList = [];
//add urls to array
for (let i = 0; i < pageNumberToNumber; i++) {
urlList.push(
`https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
);
}
console.log(urlList);
console.log(urlList.length);
for (let i = 0; i < urlList.length; i++) {
const url = urlList[i];
await page.goto(`${url}`);
await page.waitForNavigation({ waitUntil: "networkidle2" });
}
})();
【问题讨论】:
标签: javascript arrays loops web-scraping puppeteer