const puppeteer = require('puppeteer'); const genericVinParserFactory = require('../processors/generics/generic-vin-parser'); const superagent = require('superagent'); module.exports = { cronString: '15 23 * * 0', run: async function () { const startingPoint = 'https://topclassiccarsforsale.com/amc'; const browser = await puppeteer.launch({ headless: true }); const page = await browser.newPage(); await page.setViewport({ width: 1200, height: 800 }); await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); console.log('Loading page...'); await page.goto(startingPoint, { timeout: 60000 }); console.log('page loaded'); //get total page count const visiblePageNumbers = await Promise.all((await page.$$('span.pgs > a')).map(async element => await page.evaluate(el => el.textContent, element))) const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num))); for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) { console.log('loading page #', pageNumber) await page.goto(`${startingPoint}/page/${pageNumber}/`, {timeout: 60000}); const cars = await module.exports.processPage(page); console.log(cars); cars.forEach(car => { superagent.post('http://localhost:3000/lead/createFromCrawler') .send({ url: car.url, vinNumber: car.vin }) .set('authorization', `Basic ${process.env.crawlerToken}`) .end((err, res) => { if(err){ console.error('Failed to send lead', err); } }); }) } }, processPage: async function(page) { const candidateLeads = []; const vinParser = genericVinParserFactory({ vinElementSelector: `ul.fullinfo li`, vinRegex: /(?A\d\w397\w\d{6})/i }) //start with this page, pull all the carname elements for their links let links = await page.$$('.carname'); links = await Promise.all(links.map(async element => await page.evaluate(el => el.href, element))); links = await this.filterCompletedLinks(links); console.log(`Found ${links.length} unexplored links...`); for(let i = 0;i < links.length;i++){ let link = links[i]; const newTab = await page.browser().newPage(); await newTab.setViewport({ width: 1200, height: 800 }); await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); console.log('Loading page...'); await newTab.goto(link, { timeout: 60000 }); console.log('loaded new tab', link); const possibleVin = await vinParser(newTab); if(possibleVin){ //candidate found candidateLeads.push({ vin: possibleVin, url: link }); } await newTab.close(); } return candidateLeads; }, filterCompletedLinks: function(links){ return new Promise((resolve, reject) => { superagent.post('http://localhost:3000/lead/crawler/filterUrls') .send({ urls: links }) .set('authorization', `Basic ${process.env.crawlerToken}`) .set('Accept', 'application/json') .end((err, res) => { if(err){ console.error('Failed to filter urls', err); return reject(err); } resolve(res.body?.urls); }); }) } }