【发布时间】:2022-01-16 13:11:27
【问题描述】:
这是我的第一个节点项目,所以很简单,我有这个使用 puppeteer 的刮刀。它的第一次迭代运行良好,但我想进一步模块化它。我已经把它带到了一个感觉不错的地方,但现在我正在运行它,它在刮掉大约 4 个 url 后崩溃(大约 100 个)
UnhandledPromiseRejectionWarning: ProtocolError: Protocol error (Target.createTarget): Target closed.
通过我的日志语句,我的等待似乎发生了一些事情,但我似乎找不到问题所在。它不是按顺序循环,也不是在开始下一个循环迭代之前等待第一个循环迭代完成,并且它试图一次爬取两个站点。我已经看到了有关该特定错误的一些信息,但没有与我的情况有关,所以我假设是我。
另外,旧版本在 Promise 中少了一个循环,所以这可能是罪魁祸首
const scraperObject = {
async scraper(browser, queryString, scraperProps){
let scrapedData = [];
let queryResults = [];
let page = await browser.newPage();
for (const prop of scraperProps) {
let result = Object.assign({}, prop);
await page.goto(prop.queryUrl+queryString);
await page.waitForSelector(prop.selector);
urls = await page.$$eval(prop.selector, links => {
links = links.map(el => el.href.split('?')[0]); // remove params
return [...new Set(links)]; //dedupe (to remove "promoted" listings that may appear twice)
});
result.resultUrls = urls;
queryResults.push(result);
}
// Loop through each of those links, open a new page instance and get the relevant data from them
// This may not be necessary since the search results page potentially could have most of the info needed, and wouldnt really work for a UI
let resultPagePromise = (res) => new Promise(async(resolve, reject) => {
for(const resUrl of res.resultUrls) {
console.log(resUrl);
let dataObj = {};
let newPage = await browser.newPage();
await newPage.goto(resUrl);
dataObj['listingUrl'] = resUrl;
dataObj['title'] = await newPage.$eval(res.titleSelector, text => text.textContent);
dataObj['condition'] = await newPage.$eval(res.conditionSelector, text => text.textContent);
if(res.site === 'ebay') {
let iframeGetter = await newPage.waitForSelector('iframe#desc_ifr');
let frame = await iframeGetter.contentFrame();
await frame.waitForSelector(res.descriptionSelector);
dataObj['description'] = await frame.$eval(res.descriptionSelector, text => text.textContent.replace(/(\r\n\t|\n|\r|\t)/gm, ""));
} else {
dataObj['description'] = await newPage.$eval(res.descriptionSelector, text => text.textContent.replace(/(\r\n\t|\n|\r|\t)/gm, ""));
}
dataObj['price'] = await newPage.$eval(res.priceSelector, text => text.textContent);
dataObj['location'] = await newPage.$eval(res.locationSelector, text => text.textContent);
dataObj['imageUrl'] = await newPage.$eval(res.imageUrlSelector, img => img.src);
resolve(dataObj);
await newPage.close();
}
});
for(const res of queryResults){
console.log(res);
let currentPageData = await resultPagePromise(res);
scrapedData.push(currentPageData);
}
await page.close();
console.log(scrapedData);
return scrapedData;
}
}
module.exports = scraperObject;
【问题讨论】: