Add 'server/crawlers/bringatrailer.crawler.js'
parent
528683f29d
commit
d7c080838b
@ -0,0 +1,152 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
const genericVinParserFactory = require('../processors/generics/generic-vin-parser');
|
||||
const superagent = require('superagent');
|
||||
const {log} = require('clew-logger');
|
||||
|
||||
// CSS selectors used when scraping listing pages with puppeteer.
const selectors = {
    // Anchors on a results page that link to individual listings.
    listingInternalLink: 'h3>a.typography-body1',
    // Element handed to the generic VIN parser as the place to look for a VIN.
    vinSpan: 'h2>span.flex-1',
    // Anchor on a listing page whose href is reported as the lead URL.
    listingExternalLink: '#vehicle_gotoAuction>a',
    // Pagination indicator; not referenced by any live code in this file.
    totalPageIndicator: '.mx-2>span.typography-subtitle1:nth-child(3)',
};
|
||||
|
||||
module.exports = {
|
||||
cronString: '00 8 * * 0',
|
||||
run: async function () {
|
||||
|
||||
function getQueryBody(pageNum) {
|
||||
return `page=${pageNum}&per_page=24&get_items=1&get_stats=0&base_filter%5Bkeyword_pages%5D%5B%5D=13387884&base_filter%5Bitems_type%5D=model&sort=td`
|
||||
}
|
||||
|
||||
|
||||
//get first page
|
||||
const backendUrl = 'https://bringatrailer.com/wp-json/bringatrailer/1.0/data/listings-filter';
|
||||
const firstPageResponse = await superagent.post(backendUrl)
|
||||
.send(getQueryBody(1))
|
||||
.set('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8').end();
|
||||
const totalPages = firstPageResponse.body?.pages_total;
|
||||
|
||||
|
||||
|
||||
// const startingPoint = baseUrl;
|
||||
// const browser = await puppeteer.launch({
|
||||
// headless: true,
|
||||
// args: ['--no-sandbox']
|
||||
// });
|
||||
// const page = await browser.newPage();
|
||||
// await page.setViewport({
|
||||
// width: 1200,
|
||||
// height: 800
|
||||
// });
|
||||
// await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
|
||||
// log.info('Loading page...');
|
||||
// await page.goto(startingPoint, { timeout: 60000 });
|
||||
// log.info('page loaded');
|
||||
|
||||
// //get total page count
|
||||
// const visiblePageNumbers = await Promise.all((await page.$$(selectors.totalPageIndicator)).map(async element => await page.evaluate(el => el.textContent, element)))
|
||||
// const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num)));
|
||||
for(let pageNumber = 1;pageNumber < totalPages;pageNumber++) {
|
||||
const pageContent = await superagent.post(backendUrl)
|
||||
.send(getQueryBody(pageNumber))
|
||||
.set('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8').end();
|
||||
|
||||
let cars = await this.filterCompletedLinks((pageContent.body?.items || []).map(c => c.url)).map(item => {
|
||||
return {
|
||||
vin: null, //I really don't feel like loading each link to pull the vin right now. Maybe later.
|
||||
url: item
|
||||
}
|
||||
});
|
||||
log.info(cars);
|
||||
cars.forEach(car => {
|
||||
superagent.post(`${process.env.parentUrl}/lead/createFromCrawler`)
|
||||
.send({
|
||||
url: car.url,
|
||||
vinNumber: car.vin
|
||||
})
|
||||
.set('authorization', `Basic ${process.env.crawlerToken}`)
|
||||
.end((err, res) => {
|
||||
if(err){
|
||||
log.error({
|
||||
message: 'Failed to send lead',
|
||||
error: err
|
||||
});
|
||||
}
|
||||
});
|
||||
})
|
||||
}
|
||||
|
||||
},
|
||||
processPage: async function(page) {
|
||||
const candidateLeads = [];
|
||||
const vinParser = genericVinParserFactory({
|
||||
vinElementSelector: selectors.vinSpan,
|
||||
vinRegex: /(?<vin>A\d\w397\w\d{6})/i
|
||||
})
|
||||
|
||||
//start with this page, pull all the carname elements for their links
|
||||
let links = await page.$$(selectors.listingInternalLink);
|
||||
links = await Promise.all(links.map(async element => baseUrl + (await page.evaluate(el => el.href, element))));
|
||||
|
||||
links = await this.filterCompletedLinks(links);
|
||||
|
||||
|
||||
log.info(`Found ${links.length} unexplored links...`);
|
||||
|
||||
|
||||
|
||||
for(let i = 0;i < links.length;i++){
|
||||
let link = links[i];
|
||||
const newTab = await page.browser().newPage();
|
||||
await newTab.setViewport({
|
||||
width: 1200,
|
||||
height: 800
|
||||
});
|
||||
await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
|
||||
log.info('Loading page...');
|
||||
await newTab.goto(link, { timeout: 60000 });
|
||||
log.info({
|
||||
message: 'loaded new tab',
|
||||
link
|
||||
});
|
||||
const possibleVin = await vinParser(newTab);
|
||||
if(possibleVin){
|
||||
|
||||
//get the external link
|
||||
const externalAnchor = await newTab.$(selectors.listingExternalLink);
|
||||
if(externalAnchor) {
|
||||
//candidate found
|
||||
|
||||
const externalLink = await page.evaluate(el => el.href, externalAnchor);
|
||||
candidateLeads.push({
|
||||
vin: possibleVin,
|
||||
url: externalLink
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
await newTab.close();
|
||||
|
||||
}
|
||||
return candidateLeads;
|
||||
},
|
||||
filterCompletedLinks: function(links){
|
||||
return new Promise((resolve, reject) => {
|
||||
superagent.post(`${process.env.parentUrl}/lead/crawler/filterUrls`)
|
||||
.send({
|
||||
urls: links
|
||||
})
|
||||
.set('authorization', `Basic ${process.env.crawlerToken}`)
|
||||
.set('Accept', 'application/json')
|
||||
.end((err, res) => {
|
||||
if(err){
|
||||
console.error('Failed to filter urls', err);
|
||||
return reject(err);
|
||||
}
|
||||
resolve(res.body?.urls);
|
||||
});
|
||||
})
|
||||
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue