const puppeteer = require('puppeteer');
const superagent = require('superagent');
const path = require('path');
const express = require('express');
const glob = require('glob');
const _ = require('lodash');
const bodyParser = require('body-parser');
const CronJob = require('cron').CronJob;
const clew = require('clew-logger');

const app = express();

clew.create({
  prod: true,
  prodHost: 'http://tonkatown.docker:3100',
  appTag: 'listingExtractor',
});
const log = clew.log;

app.use(clew.context.attachContext('listingExtractor'));
app.use(clew.requestLogger);

// User agent presented to listing sites by the headless browser.
const USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36";

// Processor modules discovered at startup; filled asynchronously by the glob below.
const processors = [];

// Load the processor files.
glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
  clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
    // On failure `matches` is not usable, so bail out instead of dereferencing it.
    if (error) {
      log.error(`Failed to load processors: ${error.message}`);
      return;
    }
    _.forEach(matches, (file) => {
      const processor = require(path.resolve(__dirname, file));
      processors.push(processor);
    });
    log.info(`${matches.length} processors loaded`);
  }, { process: 'processor loader' });
});

// Load the crawler files and schedule each one on its own cron string.
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
  clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
    if (error) {
      log.error(`Failed to load crawlers: ${error.message}`);
      return;
    }
    _.forEach(matches, (file) => {
      const crawler = require(path.resolve(__dirname, file));
      const cronJob = new CronJob(crawler.cronString, () => {
        // Each scheduled run gets a fresh logging context tagged with the crawler file.
        const ctx = clew.context.init('listingExtractor');
        // NOTE(review): crawler.run is passed unbound, exactly as before; confirm
        // that crawler modules do not rely on `this` inside run.
        clew.context.assignRequestId(ctx, crawler.run, { crawler: file });
      });
      cronJob.start();
    });
    log.info(`${matches.length} crawlers loaded`);
  }, { process: 'crawler loader' });
});

app.use(bodyParser.json());

/**
 * POST /convertGalleryToHar
 *
 * Reads `url` from the JSON body, picks the processor whose `baseUrl` matches
 * the URL's hostname (a leading "www." is ignored on both sides), and responds
 * with the parsed VIN, mileage and the gallery images as `log.entries`.
 *
 * Responds 400 when the url is missing/malformed or no processor matches, and
 * 500 with `{ message }` when extraction fails.
 */
app.post('/convertGalleryToHar', async (req, res) => {
  const url = req.body.url;
  log.info(url);

  // `new URL` throws on missing or malformed input. Inside an async Express
  // handler that would be an unhandled rejection and the request would hang,
  // so turn it into an explicit 400.
  let searchableHostname;
  try {
    searchableHostname = (new URL(url)).hostname.replace(/^www\./i, '');
  } catch (error) {
    log.error(error.message);
    return res.status(400).json({ message: 'A valid url is required' });
  }
  log.info(`Searching for hostname: ${searchableHostname}`);

  const processor = _.find(
    processors,
    (candidate) => searchableHostname === candidate.baseUrl.replace(/^www\./i, ''),
  );
  if (!processor) {
    return res.status(400).json({
      message: 'Could not find processor for url',
    });
  }
  log.info(`Processor found ${processor.baseUrl}`);

  try {
    const payloads = await run(url, processor);
    return res.status(200).json({
      vin: payloads.vin,
      mileage: payloads.mileage,
      log: {
        entries: payloads.payloads,
      },
    });
  } catch (error) {
    log.error(error.message);
    // A plain Error serialises to `{}`; send the message explicitly so the
    // client can see what went wrong.
    return res.status(500).json({ message: error.message });
  }
});

/** GET /supportedSites: lists the baseUrl of every loaded processor. */
app.get('/supportedSites', (req, res) => {
  res.status(200).json(processors.map((p) => p.baseUrl));
});

app.listen(2667);

/**
 * Turns one gallery entry into a response-content payload.
 *
 * Entries with a `url` are downloaded; entries with inline `base64` data are
 * wrapped as-is using their `contentType`. Anything else, and any download
 * failure or non-200 status, yields an empty object so that one bad image does
 * not fail the whole gallery.
 *
 * @param {{url?: string, base64?: string, contentType?: string}} image
 * @returns {Promise<object>} `{ response: { content: { mimeType, encoding, text } } }` or `{}`.
 */
async function downloadImage(image) {
  if (image.url) {
    try {
      const response = await superagent
        .get(image.url)
        .responseType('blob')
        .timeout(30000);
      if (response.statusCode === 200) {
        log.info(`Resolving: ${image.url}`);
        return {
          response: {
            content: {
              mimeType: response.headers['content-type'],
              encoding: 'base64',
              text: response.body.toString('base64'),
            },
          },
        };
      }
      log.info(`Invalid status code ${response.statusCode} for ${image.url}`);
      return {};
    } catch (error) {
      // Best effort: log and fall through to an empty entry.
      log.error(error.message);
      return {};
    }
  }

  if (image.base64) {
    return {
      response: {
        content: {
          mimeType: image.contentType,
          encoding: 'base64',
          text: image.base64,
        },
      },
    };
  }

  return {};
}

/**
 * Opens `url` in a headless browser, lets `processor` extract the VIN, mileage
 * and gallery image list from the page, then resolves every image to a
 * base64 payload.
 *
 * @param {string} url - Listing page to load.
 * @param {object} processor - Must provide async `parseVIN(page)`,
 *   `parseMileage(page)` and `execute(page)`; `execute` returns the gallery
 *   entries handed to the image download step.
 * @returns {Promise<{vin: *, payloads: object[], mileage: *}>}
 * @throws Propagates any error from browser launch, navigation or the processor.
 */
async function run(url, processor) {
  const browser = await puppeteer.launch({
    headless: true,
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 800 });
    await page.setUserAgent(USER_AGENT);

    log.info('Loading page...');
    await page.goto(url, { timeout: 60000 });
    log.info('page loaded');

    log.info('attempting to parse fields');
    const vin = await processor.parseVIN(page);
    log.info(`parsed VIN: ${vin}`);
    const mileage = await processor.parseMileage(page);
    log.info(`parsed Mileage: ${mileage}`);

    const galleryUrls = await processor.execute(page);
    await page.close();
    log.info(`Done collecting URLS (${galleryUrls.length})`);

    const payloads = await Promise.all(
      galleryUrls.map((image) => downloadImage(image)),
    );
    log.info('URLS done downloading');

    return {
      vin,
      payloads,
      mileage,
    };
  } finally {
    // Always release the browser process, including when navigation or
    // parsing throws part-way through.
    await browser.close();
  }
}