
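// listingExtractor service: loads site-specific processors and cron-scheduled
// crawlers from disk, then exposes an HTTP API (port 2667) that turns a vehicle
// listing page into parsed fields (VIN, mileage) plus a HAR-style log of its
// gallery images.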

const puppeteer = require('puppeteer');
const superagent = require('superagent');
const path = require('path');
const express = require('express');
const glob = require('glob');
const _ = require('lodash');
const app = express();
const bodyParser = require('body-parser');
const CronJob = require('cron').CronJob;
const clew = require('clew-logger');

clew.create({
  prod: true,
  prodHost: 'http://tonkatown.docker:3100',
  appTag: 'listingExtractor',
});
const log = clew.log;
// console.log(log);
app.use(clew.context.attachContext('listingExtractor'));
app.use(clew.requestLogger);
// setTimeout(() => console.log(clew.log), 5000)
const processors = [];
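// Processor modules (processors/*.processor.js) are used below as objects with:
//   baseUrl                  - hostname the processor handles (matched against the listing URL)
//   parseVIN(page),
//   parseMileage(page)       - extract fields from the loaded puppeteer page
//   execute(page)            - return gallery image descriptors ({ url } or { base64, contentType })
// A minimal processor sketch, assuming only the shape this file relies on
// (the site name and selectors below are illustrative, not real):
//   module.exports = {
//     baseUrl: 'example-cars.com',
//     parseVIN: page => page.$eval('.vin', el => el.textContent.trim()),
//     parseMileage: page => page.$eval('.mileage', el => el.textContent.trim()),
//     execute: page => page.$$eval('.gallery img', imgs => imgs.map(img => ({ url: img.src })))
//   };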
//load the processor files
glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
  if (error) {
    return log.error(`Failed to load processors: ${error.message}`);
  }
  clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
    _.forEach(matches, file => {
      const processor = require(path.resolve(__dirname, file));
      processors.push(processor);
    });
    log.info(`${matches.length} processors loaded`);
  }, {
    process: 'processor loader'
  });
});
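// Crawler modules (crawlers/*.crawler.js) are scheduled below with the cron
// package's CronJob and are used as objects with:
//   cronString - cron schedule string
//   run        - function invoked on every tick (wrapped in a fresh clew request context)
// Illustrative sketch (schedule and body are assumptions):
//   module.exports = { cronString: '0 0 * * * *', run: async () => { /* crawl a site */ } };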
//load the crawler files
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
  if (error) {
    return log.error(`Failed to load crawlers: ${error.message}`);
  }
  clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
    _.forEach(matches, file => {
      const crawler = require(path.resolve(__dirname, file));
      // each crawler gets its own cron job; every tick runs inside its own clew request context
      const cronJob = new CronJob(crawler.cronString, () => {
        const ctx = clew.context.init('listingExtractor');
        clew.context.assignRequestId(ctx, crawler.run, {
          crawler: file
        });
      });
      cronJob.start();
    });
    log.info(`${matches.length} crawlers loaded`);
  }, {
    process: 'crawler loader'
  });
});
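// HTTP API
//   POST /convertGalleryToHar   body: { "url": "<vehicle listing url>" }
//     -> { vin, mileage, log: { entries: [...] } }, each entry carrying a base64 image body
//   GET  /supportedSites        -> hostnames for which a processor is loaded
// Example call (host and listing URL are illustrative):
//   curl -X POST http://localhost:2667/convertGalleryToHar \
//     -H 'Content-Type: application/json' \
//     -d '{"url":"https://example-cars.com/listing/123"}'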
app.use(bodyParser.json());
app.post('/convertGalleryToHar', async (req, res) => {
  const url = req.body.url;
  log.info(url);
  // find the processor whose baseUrl matches the listing's hostname (www. stripped on both sides)
  let searchableHostname;
  try {
    searchableHostname = new URL(url).hostname.replace(/^www\./i, '');
  } catch (error) {
    return res.status(400).json({
      message: 'Invalid url'
    });
  }
  log.info(`Searching for hostname: ${searchableHostname}`);
  const processor = _.find(processors, processor => searchableHostname === processor.baseUrl.replace(/^www\./i, ''));
  if (!processor) {
    return res.status(400).json({
      message: 'Could not find processor for url'
    });
  }
  log.info(`Processor found ${processor.baseUrl}`);
  try {
    const payloads = await run(url, processor);
    res.status(200).json({
      vin: payloads.vin,
      mileage: payloads.mileage,
      log: {
        entries: payloads.payloads
      }
    });
  } catch (error) {
    log.error({ message: error.message });
    res.status(500).json({ message: error.message });
  }
});
app.get('/supportedSites', async (req, res) => {
  res.status(200).json(processors.map(p => p.baseUrl));
});
app.listen(2667);
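// run() drives one headless Chromium session: load the listing page, parse the
// VIN and mileage via the matched processor, collect the gallery image
// descriptors, then download each image and wrap it as a HAR-style response
// entry (base64 body + mime type).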
async function run(url, processor) {
  const browser = await puppeteer.launch({
    headless: true
  });
  try {
    const page = await browser.newPage();
    await page.setViewport({
      width: 1200,
      height: 800
    });
    await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36');
    log.info('Loading page...');
    await page.goto(url, { timeout: 60000 });
    log.info('page loaded');
    log.info('attempting to parse fields');
    const vin = await processor.parseVIN(page);
    log.info(`parsed VIN: ${vin}`);
    const mileage = await processor.parseMileage(page);
    log.info(`parsed Mileage: ${mileage}`);
    const galleryUrls = await processor.execute(page);
    await page.close();
    log.info(`Done collecting URLS (${galleryUrls.length})`);
    // download each gallery image and wrap it as a HAR-style response entry
    const payloads = await Promise.all(galleryUrls.map(async image => {
      if (image.url) {
        try {
          const response = await superagent.get(image.url).responseType('blob').timeout(30000);
          if (response.statusCode === 200) {
            log.info(`Resolving: ${image.url}`);
            return {
              response: {
                content: {
                  mimeType: response.headers['content-type'],
                  encoding: 'base64',
                  text: response.body.toString('base64')
                }
              }
            };
          }
          log.info(`Invalid status code ${response.statusCode} for ${image.url}`);
          return {};
        } catch (error) {
          log.error(error.message);
          return {};
        }
      }
      if (image.base64) {
        // the processor already extracted the image as a base64 string
        return {
          response: {
            content: {
              mimeType: image.contentType,
              encoding: 'base64',
              text: image.base64
            }
          }
        };
      }
      return {};
    }));
    log.info('URLS done downloading');
    return {
      vin,
      payloads,
      mileage
    };
  } finally {
    // always release the browser, even when navigation or parsing throws
    await browser.close();
  }
}
// run();