diff --git a/server/crawlers/classic.processor.js b/server/crawlers/classic.processor.js new file mode 100644 index 0000000..dcd9c40 --- /dev/null +++ b/server/crawlers/classic.processor.js @@ -0,0 +1,134 @@ +const puppeteer = require('puppeteer'); +const genericVinParserFactory = require('../processors/generics/generic-vin-parser'); +const superagent = require('superagent'); +const {log} = require('clew-logger'); + +const selectors = { + listingInternalLink: "h3>a.typography-body1", + vinSpan: "h2>span.flex-1", + listingExternalLink: "#vehicle_gotoAuction>a", + totalPageIndicator: ".mx-2>span.typography-subtitle1:nth-child(3)" +} +const baseUrl = 'https://www.classic.com/m/amc/amx' + +module.exports = { + cronString: '00 6 * * 0', + run: async function () { + + + const startingPoint = baseUrl; + const browser = await puppeteer.launch({ + headless: true, + args: ['--no-sandbox'] + }); + const page = await browser.newPage(); + await page.setViewport({ + width: 1200, + height: 800 + }); + await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); + log.info('Loading page...'); + await page.goto(startingPoint, { timeout: 60000 }); + log.info('page loaded'); + + //get total page count + const visiblePageNumbers = await Promise.all((await page.$$(selectors.totalPageIndicator)).map(async element => await page.evaluate(el => el.textContent, element))) + const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num))); + for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) { + log.info(`loading page # ${pageNumber}`) + await page.goto(`${startingPoint}?page=${pageNumber}`, {timeout: 60000}); + const cars = await module.exports.processPage(page); + log.info(cars); + cars.forEach(car => { + superagent.post(`${process.env.parentUrl}/lead/createFromCrawler`) + .send({ + url: car.url, + vinNumber: car.vin + }) + .set('authorization', `Basic ${process.env.crawlerToken}`) + .end((err, res) => { + if(err){ + log.error({ + message: 'Failed to send lead', + error: err + }); + } + }); + }) + } + await browser.close(); + + }, + processPage: async function(page) { + const candidateLeads = []; + const vinParser = genericVinParserFactory({ + vinElementSelector: selectors.vinSpan, + vinRegex: /(?A\d\w397\w\d{6})/i + }) + + //start with this page, pull all the carname elements for their links + let links = await page.$$(selectors.listingInternalLink); + links = await Promise.all(links.map(async element => baseUrl + (await page.evaluate(el => el.href, element)))); + + links = await this.filterCompletedLinks(links); + + + log.info(`Found ${links.length} unexplored links...`); + + + + for(let i = 0;i < links.length;i++){ + let link = links[i]; + const newTab = await page.browser().newPage(); + await newTab.setViewport({ + width: 1200, + height: 800 + }); + await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); + log.info('Loading page...'); + await newTab.goto(link, { timeout: 60000 }); + log.info({ + message: 'loaded new tab', + link + }); + const possibleVin = await vinParser(newTab); + if(possibleVin){ + + //get the external link + const externalAnchor = await newTab.$(selectors.listingExternalLink); + if(externalAnchor) { + //candidate found + + const externalLink = await page.evaluate(el => el.href, externalAnchor); + candidateLeads.push({ + vin: possibleVin, + url: externalLink + }); + } + + + } + await newTab.close(); + + } + return candidateLeads; + }, + filterCompletedLinks: function(links){ + return new Promise((resolve, reject) => { + superagent.post(`${process.env.parentUrl}/lead/crawler/filterUrls`) + .send({ + urls: links + }) + .set('authorization', `Basic ${process.env.crawlerToken}`) + .set('Accept', 'application/json') + .end((err, res) => { + if(err){ + console.error('Failed to filter urls', err); + return reject(err); + } + resolve(res.body?.urls); + }); + }) + + } +} \ No newline at end of file