const puppeteer = require('puppeteer'); const genericVinParserFactory = require('../processors/generics/generic-vin-parser'); const superagent = require('superagent'); const {log} = require('clew-logger'); const selectors = { listingInternalLink: "h3>a.typography-body1", vinSpan: "h2>span.flex-1", listingExternalLink: "#vehicle_gotoAuction>a", totalPageIndicator: ".mx-2>span.typography-subtitle1:nth-child(3)" } module.exports = { cronString: '00 8 * * 0', run: async function () { function getQueryBody(pageNum) { return `page=${pageNum}&per_page=24&get_items=1&get_stats=0&base_filter%5Bkeyword_pages%5D%5B%5D=13387884&base_filter%5Bitems_type%5D=model&sort=td` } //get first page const backendUrl = 'https://bringatrailer.com/wp-json/bringatrailer/1.0/data/listings-filter'; const firstPageResponse = await superagent.post(backendUrl) .send(getQueryBody(1)) .set('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8').end(); const totalPages = firstPageResponse.body?.pages_total; // const startingPoint = baseUrl; // const browser = await puppeteer.launch({ // headless: true, // args: ['--no-sandbox'] // }); // const page = await browser.newPage(); // await page.setViewport({ // width: 1200, // height: 800 // }); // await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); // log.info('Loading page...'); // await page.goto(startingPoint, { timeout: 60000 }); // log.info('page loaded'); // //get total page count // const visiblePageNumbers = await Promise.all((await page.$$(selectors.totalPageIndicator)).map(async element => await page.evaluate(el => el.textContent, element))) // const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num))); for(let pageNumber = 1;pageNumber < totalPages;pageNumber++) { const pageContent = await superagent.post(backendUrl) .send(getQueryBody(pageNumber)) .set('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8').end(); let cars = await this.filterCompletedLinks((pageContent.body?.items || []).map(c => c.url)).map(item => { return { vin: null, //I really don't feel like loading each link to pull the vin right now. Maybe later. url: item } }); log.info(cars); cars.forEach(car => { superagent.post(`${process.env.parentUrl}/lead/createFromCrawler`) .send({ url: car.url, vinNumber: car.vin }) .set('authorization', `Basic ${process.env.crawlerToken}`) .end((err, res) => { if(err){ log.error({ message: 'Failed to send lead', error: err }); } }); }) } }, processPage: async function(page) { const candidateLeads = []; const vinParser = genericVinParserFactory({ vinElementSelector: selectors.vinSpan, vinRegex: /(?A\d\w397\w\d{6})/i }) //start with this page, pull all the carname elements for their links let links = await page.$$(selectors.listingInternalLink); links = await Promise.all(links.map(async element => baseUrl + (await page.evaluate(el => el.href, element)))); links = await this.filterCompletedLinks(links); log.info(`Found ${links.length} unexplored links...`); for(let i = 0;i < links.length;i++){ let link = links[i]; const newTab = await page.browser().newPage(); await newTab.setViewport({ width: 1200, height: 800 }); await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); log.info('Loading page...'); await newTab.goto(link, { timeout: 60000 }); log.info({ message: 'loaded new tab', link }); const possibleVin = await vinParser(newTab); if(possibleVin){ //get the external link const externalAnchor = await newTab.$(selectors.listingExternalLink); if(externalAnchor) { //candidate found const externalLink = await page.evaluate(el => el.href, externalAnchor); candidateLeads.push({ vin: possibleVin, url: externalLink }); } } await newTab.close(); } return candidateLeads; }, filterCompletedLinks: function(links){ return new Promise((resolve, reject) => { superagent.post(`${process.env.parentUrl}/lead/crawler/filterUrls`) .send({ urls: links }) .set('authorization', `Basic ${process.env.crawlerToken}`) .set('Accept', 'application/json') .end((err, res) => { if(err){ console.error('Failed to filter urls', err); return reject(err); } resolve(res.body?.urls); }); }) } }