From a06613cd2e12d994cdb0643a5c13e30fe6e24fda Mon Sep 17 00:00:00 2001 From: Edward Peterson Date: Sat, 10 Jun 2023 00:24:50 -0400 Subject: [PATCH] Fixed pages not closing when done parsing Reduced redundant calls to crawled websites. --- crawlers/topclassiccarsforsale.crawler.js | 26 +++++++++++++++++++++++ index.js | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/crawlers/topclassiccarsforsale.crawler.js b/crawlers/topclassiccarsforsale.crawler.js index 4b3e156..bc16e3d 100644 --- a/crawlers/topclassiccarsforsale.crawler.js +++ b/crawlers/topclassiccarsforsale.crawler.js @@ -52,6 +52,14 @@ module.exports = { //start with this page, pull all the carname elements for their links let links = await page.$$('.carname'); links = await Promise.all(links.map(async element => await page.evaluate(el => el.href, element))); + + links = await this.filterCompletedLinks(links); + + + console.log(`Found ${links.length} unexplored links...`); + + + for(let i = 0;i < links.length;i++){ let link = links[i]; const newTab = await page.browser().newPage(); @@ -75,5 +83,23 @@ module.exports = { } return candidateLeads; + }, + filterCompletedLinks: function(links){ + return new Promise((resolve, reject) => { + superagent.post('http://localhost:3000/lead/crawler/filterUrls') + .send({ + urls: links + }) + .set('authorization', `Basic ${process.env.crawlerToken}`) + .set('Accept', 'application/json') + .end((err, res) => { + if(err){ + console.error('Failed to filter urls', err); + return reject(err); + } + resolve(res.body?.urls); + }); + }) + } } \ No newline at end of file diff --git a/index.js b/index.js index 28881b0..e0b78fb 100644 --- a/index.js +++ b/index.js @@ -85,7 +85,7 @@ async function run(url, processor) { const mileage = await processor.parseMileage(page); console.log('parsed Mileage:', mileage); const galleryUrls = await processor.execute(page); - + await page.close(); console.log('Done collecting URLS', galleryUrls.length); const payloads = await 
Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => {