Added 1st crawler

crawler
Edward Peterson 3 years ago
parent 797ab657d8
commit 2f5cc56b5f

@ -0,0 +1,65 @@
const puppeteer = require('puppeteer');
const genericVinParserFactory = require('../processors/generics/generic-vin-parser');
module.exports = {
cronString: '46 * * * *',
run: async function () {
const startingPoint = 'https://topclassiccarsforsale.com/amc';
const browser = await puppeteer.launch({
headless: true
});
const page = await browser.newPage();
await page.setViewport({
width: 1200,
height: 800
});
await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...');
await page.goto(startingPoint, { timeout: 60000 });
console.log('page loaded');
//get total page count
const visiblePageNumbers = await Promise.all((await page.$$('span.pgs > a')).map(async element => await page.evaluate(el => el.textContent, element)))
const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num)));
for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) {
console.log('loading page #', pageNumber)
await page.goto(`${startingPoint}/page/${pageNumber}/`, {timeout: 60000});
const cars = await module.exports.processPage(page);
console.log(cars);
}
},
processPage: async function(page) {
const candidateLeads = [];
const vinParser = genericVinParserFactory({
vinElementSelector: `ul.fullinfo li`,
vinRegex: /(?<vin>A\d\w397\w\d{6})/i
})
//start with this page, pull all the carname elements for their links
let links = await page.$$('.carname');
links = await Promise.all(links.map(async element => await page.evaluate(el => el.href, element)));
for(let i = 0;i < links.length;i++){
let link = links[i];
const newTab = await page.browser().newPage();
await newTab.setViewport({
width: 1200,
height: 800
});
await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...');
await newTab.goto(link, { timeout: 60000 });
console.log('loaded new tab', link);
const possibleVin = await vinParser(newTab);
if(possibleVin){
//candidate found
candidateLeads.push({
vin: possibleVin,
url: link
});
}
await newTab.close();
}
return candidateLeads;
}
}

@ -6,7 +6,7 @@ const glob = require('glob');
const _ = require('lodash'); const _ = require('lodash');
const app = express(); const app = express();
var bodyParser = require('body-parser') var bodyParser = require('body-parser')
const CronJob = require('cron').CronJob;
const processors = []; const processors = [];
//load the processor files //load the processor files
@ -18,6 +18,17 @@ glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
}) })
console.log(`${matches.length} processors loaded`); console.log(`${matches.length} processors loaded`);
}) })
//load the crawler files
// Every module matching crawlers/*.crawler.js is required and scheduled with
// its own `cronString`; its `run` function is invoked on each tick.
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
  if (error) {
    // On failure `matches` cannot be relied on, so bail out instead of
    // dereferencing it below.
    console.error('failed to load crawler files', error);
    return;
  }
  console.log(matches);
  _.forEach(matches, file => {
    const crawler = require(path.resolve(__dirname, file));
    // `run` is async: catch its rejection here so one failed crawl is logged
    // rather than surfacing as an unhandled promise rejection.
    const cronJob = new CronJob(crawler.cronString, async () => {
      try {
        await crawler.run();
      } catch (runError) {
        console.error(`crawler ${file} failed`, runError);
      }
    });
    cronJob.start();
  });
  console.log(`${matches.length} crawlers loaded`);
});
app.use(bodyParser.json()) app.use(bodyParser.json())
app.post('/convertGalleryToHar', async (req, res) => { app.post('/convertGalleryToHar', async (req, res) => {
const url = req.body.url; const url = req.body.url;

1588
package-lock.json generated

File diff suppressed because it is too large Load Diff

@ -10,6 +10,7 @@
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"body-parser": "^1.20.1", "body-parser": "^1.20.1",
"cron": "^2.2.0",
"express": "^4.18.2", "express": "^4.18.2",
"glob": "^8.0.3", "glob": "^8.0.3",
"lodash": "^4.17.21", "lodash": "^4.17.21",

@ -2,7 +2,7 @@ const _ = require('lodash');
module.exports = function(config) { module.exports = function(config) {
return async function (page) { return async function (page) {
const vinSelector = config.vinElementSelector; const vinSelector = config.vinElementSelector;
const vinRegex = /(?<vin>A\d\w\d{3}\w\d{6})/i; const vinRegex = config.vinRegex || /(?<vin>A\d\w\d{3}\w\d{6})/i;
const possibleVinElements = await page.$$(vinSelector); const possibleVinElements = await page.$$(vinSelector);
const evaluatedVinElements = await Promise.all(possibleVinElements.map(async element => await page.evaluate(el => el.textContent, element))); const evaluatedVinElements = await Promise.all(possibleVinElements.map(async element => await page.evaluate(el => el.textContent, element)));
const correctElement = _.find(evaluatedVinElements, element => vinRegex.test(element)); const correctElement = _.find(evaluatedVinElements, element => vinRegex.test(element));

Loading…
Cancel
Save