const puppeteer = require('puppeteer'); const genericVinParserFactory = require('../processors/generics/generic-vin-parser'); const superagent = require('superagent'); const {log} = require('clew-logger'); const selectors = { listingInternalLink: "h3>a.typography-body1", vinSpan: "h2>span.flex-1", listingExternalLink: "#vehicle_gotoAuction>a", totalPageIndicator: ".mx-2>span.typography-subtitle1:nth-child(3)" } const baseUrl = 'https://www.classic.com/m/amc/amx' module.exports = { cronString: '00 6 * * 0', run: async function () { const startingPoint = baseUrl; const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] }); const page = await browser.newPage(); await page.setViewport({ width: 1200, height: 800 }); await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); log.info('Loading page...'); await page.goto(startingPoint, { timeout: 60000 }); log.info('page loaded'); //get total page count const visiblePageNumbers = await Promise.all((await page.$$(selectors.totalPageIndicator)).map(async element => await page.evaluate(el => el.textContent, element))) const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num))); for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) { log.info(`loading page # ${pageNumber}`) await page.goto(`${startingPoint}?page=${pageNumber}`, {timeout: 60000}); const cars = await module.exports.processPage(page); log.info(cars); cars.forEach(car => { superagent.post(`${process.env.parentUrl}/lead/createFromCrawler`) .send({ url: car.url, vinNumber: car.vin }) .set('authorization', `Basic ${process.env.crawlerToken}`) .end((err, res) => { if(err){ log.error({ message: 'Failed to send lead', error: err }); } }); }) } await browser.close(); }, processPage: async function(page) { const candidateLeads = []; const vinParser = genericVinParserFactory({ vinElementSelector: selectors.vinSpan, vinRegex: /(?A\d\w397\w\d{6})/i }) //start with this page, pull all the carname elements for their links let links = await page.$$(selectors.listingInternalLink); links = await Promise.all(links.map(async element => baseUrl + (await page.evaluate(el => el.href, element)))); links = await this.filterCompletedLinks(links); log.info(`Found ${links.length} unexplored links...`); for(let i = 0;i < links.length;i++){ let link = links[i]; const newTab = await page.browser().newPage(); await newTab.setViewport({ width: 1200, height: 800 }); await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); log.info('Loading page...'); await newTab.goto(link, { timeout: 60000 }); log.info({ message: 'loaded new tab', link }); const possibleVin = await vinParser(newTab); if(possibleVin){ //get the external link const externalAnchor = await newTab.$(selectors.listingExternalLink); if(externalAnchor) { //candidate found const externalLink = await page.evaluate(el => el.href, externalAnchor); candidateLeads.push({ vin: possibleVin, url: externalLink }); } } await newTab.close(); } return candidateLeads; }, filterCompletedLinks: function(links){ return new Promise((resolve, reject) => { superagent.post(`${process.env.parentUrl}/lead/crawler/filterUrls`) .send({ urls: links }) .set('authorization', `Basic ${process.env.crawlerToken}`) .set('Accept', 'application/json') .end((err, res) => { if(err){ log.error({ message: 'Failed to filter urls', error: err }); return reject(err); } resolve(res.body?.urls); }); }) } }