const puppeteer = require('puppeteer');
const superagent = require('superagent');
const path = require('path');
const express = require('express');
const glob = require('glob');
const _ = require('lodash');
const bodyParser = require('body-parser');
const CronJob = require('cron').CronJob;

const app = express();

// Processor modules discovered at startup; filled asynchronously by the glob
// callback below, so it may still be empty for requests that arrive very early.
const processors = [];

// Fallback MIME type for inline base64 images when the gallery entry does not
// carry one. NOTE(review): assumes processors may set `image.mimeType`; confirm
// against the processor modules and pick a more specific default if appropriate.
const DEFAULT_BASE64_MIME_TYPE = 'application/octet-stream';

/**
 * Strips a leading "www." (case-insensitive) from a host name.
 *
 * @param {string} host - Host name to normalize.
 * @returns {string} The host name without a leading "www.".
 */
function stripWww(host) {
  return host.replace(/^www\./i, '');
}

/**
 * Wraps base64 image data in the HAR-entry-like shape returned by the API.
 *
 * @param {string} mimeType - Value for `content.mimeType`.
 * @param {string} base64Text - Base64-encoded image data.
 * @returns {{response: {content: {mimeType: string, encoding: string, text: string}}}}
 */
function toHarEntry(mimeType, base64Text) {
  return {
    response: {
      content: {
        mimeType,
        encoding: 'base64',
        text: base64Text
      }
    }
  };
}

/**
 * Converts one gallery entry into a HAR-entry-like payload.
 *
 * Entries with a `url` are downloaded; entries with `base64` are wrapped
 * directly. This function never rejects: download failures, non-200 responses
 * and entries with neither field all yield an empty object, so one bad image
 * cannot fail (or stall) the whole batch.
 *
 * @param {{url?: string, base64?: string, mimeType?: string}} image - Gallery entry.
 * @returns {Promise<object>} The payload, or `{}` when the image is unavailable.
 */
async function fetchImagePayload(image) {
  if (!image) {
    return {};
  }

  if (image.url) {
    try {
      const response = await superagent.get(image.url).responseType('blob');
      if (response.statusCode === 200) {
        console.log('Resolving', image.url);
        return toHarEntry(
          response.headers['content-type'],
          response.body.toString('base64')
        );
      }
      console.log("Invalid status code", response.statusCode, 'for', image.url);
      return {};
    } catch (error) {
      // Best effort: log and skip this image.
      console.error(error);
      return {};
    }
  }

  if (image.base64) {
    // There is no HTTP response for inline data, so the MIME type cannot come
    // from response headers (the previous code referenced an undefined
    // `response` here, which threw and left the promise pending forever).
    return toHarEntry(image.mimeType || DEFAULT_BASE64_MIME_TYPE, image.base64);
  }

  // Neither a URL nor inline data: nothing to convert.
  return {};
}

// Load the processor files.
glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
  if (error) {
    console.error(error);
    return;
  }
  console.log(matches);
  for (const file of matches) {
    processors.push(require(path.resolve(__dirname, file)));
  }
  console.log(`${matches.length} processors loaded`);
});

// Load the crawler files and schedule each one on its own cron expression.
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
  if (error) {
    console.error(error);
    return;
  }
  console.log(matches);
  for (const file of matches) {
    const crawler = require(path.resolve(__dirname, file));
    const cronJob = new CronJob(crawler.cronString, crawler.run);
    cronJob.start();
  }
  console.log(`${matches.length} crawlers loaded`);
});

app.use(bodyParser.json());

// POST /convertGalleryToHar
// Body: { url }. Finds the processor whose baseUrl matches the URL's host name
// (ignoring a leading "www."), scrapes the page and responds with
// { vin, mileage, log: { entries } }.
app.post('/convertGalleryToHar', async (req, res) => {
  const url = req.body && req.body.url;
  console.log(url);

  // Parse the URL up front: a missing or malformed URL must produce a 400
  // rather than an unhandled rejection that leaves the request hanging.
  let searchableHostname;
  try {
    searchableHostname = stripWww(new URL(url).hostname);
  } catch (error) {
    return res.status(400).json({ message: 'Invalid or missing url' });
  }
  console.log('Searching for hostname:', searchableHostname);

  const processor = _.find(
    processors,
    (candidate) => searchableHostname === stripWww(candidate.baseUrl)
  );
  if (!processor) {
    return res.status(400).json({
      message: 'Could not find processor for url'
    });
  }
  console.log('Processor found', processor.baseUrl);

  try {
    const payloads = await run(url, processor);
    res.status(200).json({
      vin: payloads.vin,
      mileage: payloads.mileage,
      log: {
        entries: payloads.payloads
      }
    });
  } catch (error) {
    console.log(error);
    // Error instances serialize to "{}" with JSON.stringify, so send the
    // message explicitly to give the client something actionable.
    res.status(500).json({
      message: (error && error.message) || 'Internal error'
    });
  }
});

// GET /supportedSites
// Responds with the baseUrl of every loaded processor.
app.get('/supportedSites', async (req, res) => {
  res.status(200).json(processors.map((p) => p.baseUrl));
});

app.listen(2667);

/**
 * Loads `url` in a headless browser, extracts the VIN, the mileage and the
 * gallery images through `processor`, then converts every image into a
 * HAR-entry-like payload.
 *
 * @param {string} url - Page to load.
 * @param {object} processor - Site processor; its `parseVIN(page)`,
 *   `parseMileage(page)` and `execute(page)` methods are awaited in that order.
 *   `execute` is expected to yield an array of `{ url }` / `{ base64 }` entries.
 * @returns {Promise<{vin: *, payloads: object[], mileage: *}>} Parsed fields and
 *   one payload per gallery entry (`{}` for entries that could not be fetched).
 * @throws Whatever puppeteer or the processor throws (e.g. navigation timeout).
 */
async function run(url, processor) {
  const browser = await puppeteer.launch({
    headless: true
  });

  // try/finally guarantees the browser process is closed even when navigation
  // or parsing throws; otherwise every failed request would leak a browser.
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 800 });
    await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");

    console.log('Loading page...');
    await page.goto(url, {
      timeout: 60000
    });
    console.log('page loaded');

    console.log('attempting to parse fields');
    const vin = await processor.parseVIN(page);
    console.log('parsed VIN:', vin);
    const mileage = await processor.parseMileage(page);
    console.log('parsed Mileage:', mileage);

    const galleryUrls = await processor.execute(page);
    await page.close();
    console.log('Done collecting URLS', galleryUrls.length);

    // fetchImagePayload never rejects, so Promise.all cannot fail fast here.
    const payloads = await Promise.all(galleryUrls.map(fetchImagePayload));
    console.log('URLS done downloading');

    return {
      vin,
      payloads,
      mileage
    };
  } finally {
    await browser.close();
  }
}