diff --git a/server/crawlers/bringatrailer.crawler.js b/server/crawlers/bringatrailer.crawler.js new file mode 100644 index 0000000..838b78c --- /dev/null +++ b/server/crawlers/bringatrailer.crawler.js @@ -0,0 +1,152 @@ +const puppeteer = require('puppeteer'); +const genericVinParserFactory = require('../processors/generics/generic-vin-parser'); +const superagent = require('superagent'); +const {log} = require('clew-logger'); + +const selectors = { + listingInternalLink: "h3>a.typography-body1", + vinSpan: "h2>span.flex-1", + listingExternalLink: "#vehicle_gotoAuction>a", + totalPageIndicator: ".mx-2>span.typography-subtitle1:nth-child(3)" +} + +module.exports = { + cronString: '00 8 * * 0', + run: async function () { + + function getQueryBody(pageNum) { + return `page=${pageNum}&per_page=24&get_items=1&get_stats=0&base_filter%5Bkeyword_pages%5D%5B%5D=13387884&base_filter%5Bitems_type%5D=model&sort=td` + } + + + //get first page + const backendUrl = 'https://bringatrailer.com/wp-json/bringatrailer/1.0/data/listings-filter'; + const firstPageResponse = await superagent.post(backendUrl) + .send(getQueryBody(1)) + .set('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8').end(); + const totalPages = firstPageResponse.body?.pages_total; + + + + // const startingPoint = baseUrl; + // const browser = await puppeteer.launch({ + // headless: true, + // args: ['--no-sandbox'] + // }); + // const page = await browser.newPage(); + // await page.setViewport({ + // width: 1200, + // height: 800 + // }); + // await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); + // log.info('Loading page...'); + // await page.goto(startingPoint, { timeout: 60000 }); + // log.info('page loaded'); + + // //get total page count + // const visiblePageNumbers = await Promise.all((await page.$$(selectors.totalPageIndicator)).map(async element => await page.evaluate(el => el.textContent, element))) + // const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num))); + for(let pageNumber = 1;pageNumber < totalPages;pageNumber++) { + const pageContent = await superagent.post(backendUrl) + .send(getQueryBody(pageNumber)) + .set('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8').end(); + + let cars = await this.filterCompletedLinks((pageContent.body?.items || []).map(c => c.url)).map(item => { + return { + vin: null, //I really don't feel like loading each link to pull the vin right now. Maybe later. + url: item + } + }); + log.info(cars); + cars.forEach(car => { + superagent.post(`${process.env.parentUrl}/lead/createFromCrawler`) + .send({ + url: car.url, + vinNumber: car.vin + }) + .set('authorization', `Basic ${process.env.crawlerToken}`) + .end((err, res) => { + if(err){ + log.error({ + message: 'Failed to send lead', + error: err + }); + } + }); + }) + } + + }, + processPage: async function(page) { + const candidateLeads = []; + const vinParser = genericVinParserFactory({ + vinElementSelector: selectors.vinSpan, + vinRegex: /(?A\d\w397\w\d{6})/i + }) + + //start with this page, pull all the carname elements for their links + let links = await page.$$(selectors.listingInternalLink); + links = await Promise.all(links.map(async element => baseUrl + (await page.evaluate(el => el.href, element)))); + + links = await this.filterCompletedLinks(links); + + + log.info(`Found ${links.length} unexplored links...`); + + + + for(let i = 0;i < links.length;i++){ + let link = links[i]; + const newTab = await page.browser().newPage(); + await newTab.setViewport({ + width: 1200, + height: 800 + }); + await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); + log.info('Loading page...'); + await newTab.goto(link, { timeout: 60000 }); + log.info({ + message: 'loaded new tab', + link + }); + const possibleVin = await vinParser(newTab); + if(possibleVin){ + + //get the external link + const externalAnchor = await newTab.$(selectors.listingExternalLink); + if(externalAnchor) { + //candidate found + + const externalLink = await page.evaluate(el => el.href, externalAnchor); + candidateLeads.push({ + vin: possibleVin, + url: externalLink + }); + } + + + } + await newTab.close(); + + } + return candidateLeads; + }, + filterCompletedLinks: function(links){ + return new Promise((resolve, reject) => { + superagent.post(`${process.env.parentUrl}/lead/crawler/filterUrls`) + .send({ + urls: links + }) + .set('authorization', `Basic ${process.env.crawlerToken}`) + .set('Accept', 'application/json') + .end((err, res) => { + if(err){ + console.error('Failed to filter urls', err); + return reject(err); + } + resolve(res.body?.urls); + }); + }) + + } +} \ No newline at end of file