@@ -7,43 +7,68 @@ const _ = require('lodash');
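// Express app for the listingExtractor service; processors and crawlers are loaded at startup below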
const app = express();
const bodyParser = require('body-parser');
const CronJob = require('cron').CronJob;
const clew = require('clew-logger');
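// set up clew logging; with prod: true, logs presumably ship to the prodHost endpoint below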
clew.create({
  prod: true,
  prodHost: 'http://tonkatown.docker:3100',
  appTag: 'listingExtractor',
})
const log = clew.log;
// console.log(log);
app.use(clew.context.attachContext('listingExtractor'));
app.use(clew.requestLogger);
// setTimeout(() => console.log(clew.log), 5000)

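// NOTE: processors load asynchronously below, so a request that arrives before
// the glob callback finishes will see an empty processor list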
const processors = [];
// load the processor files
glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
  console.log(matches);
  clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
    _.forEach(matches, file => {
      const processor = require(path.resolve(__dirname, file));
      processors.push(processor);
    })
    console.log(`${matches.length} processors loaded`);
    log.info(`${matches.length} processors loaded`);
  }, {
    process: 'processor loader'
  })
})

// load the crawler files and schedule each one on its cron string
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
  console.log(matches);
  clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {

    _.forEach(matches, file => {
      const crawler = require(path.resolve(__dirname, file));
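      // give every scheduled run its own logging context and request id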
      const cronJob = new CronJob(crawler.cronString, () => {
        const ctx = clew.context.init('listingExtractor');
        clew.context.assignRequestId(ctx, crawler.run, {
          crawler: file
        });
      });
      cronJob.start();
    })
    console.log(`${matches.length} crawlers loaded`);
    log.info(`${matches.length} crawlers loaded`);
  }, {
    process: 'crawler loader'
  })
})

app.use(bodyParser.json())

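// POST /convertGalleryToHar: scrape a listing page and return its gallery as
// HAR-style response entries (the exact payload shape is elided in this diff)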
app.post('/convertGalleryToHar', async (req, res) => {
  const url = req.body.url;
  console.log(url);
  log.info(url);

  // find the processor whose baseUrl matches the request's hostname, ignoring any leading "www."
  const searchableHostname = (new URL(url)).hostname.replace(/^www\./i, '');
  console.log('Searching for hostname:', searchableHostname)
  log.info(`Searching for hostname: ${searchableHostname}`)
  const processor = _.find(processors, (processor) => searchableHostname === processor.baseUrl.replace(/^www\./i, ''));
  if (!processor) {
    return res.status(400).json({
      message: 'Could not find processor for url'
    })
  }
  console.log('Processor found', processor.baseUrl);
  log.info(`Processor found ${processor.baseUrl}`);
  try {
    const payloads = await run(url, processor);
    res.status(200).json({
@@ -55,7 +80,7 @@ app.post('/convertGalleryToHar', async (req, res) => {
      // payloads
    })
  } catch (error) {
    console.log(error);
    log.error({ message: error.message });
    res.status(500).json(error);
  }

@@ -76,23 +101,23 @@ async function run(url, processor) {
    height: 800
  });
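  // spoof a desktop Chrome UA, presumably so sites don't serve stripped-down markup to a headless browser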
  await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
  console.log('Loading page...');
  log.info('Loading page...');
  await page.goto(url, { timeout: 60000 });
  console.log('page loaded');
  console.log('attempting to parse fields');
  log.info('page loaded');
  log.info('attempting to parse fields');
  const vin = await processor.parseVIN(page);
  console.log('parsed VIN:', vin);
  log.info(`parsed VIN: ${vin}`);
  const mileage = await processor.parseMileage(page);
  console.log('parsed Mileage:', mileage);
  log.info(`parsed Mileage: ${mileage}`);
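  // the processor walks the gallery and returns image descriptors, each carrying
  // either a url or a base64 payload (see the branches below)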
  const galleryUrls = await processor.execute(page);
  await page.close();

  console.log('Done collecting URLS', galleryUrls.length);
  log.info(`Done collecting URLS (${galleryUrls.length})`);
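  // download all images in parallel; every failure path resolves with {} so a
  // single bad image can't reject the whole Promise.all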
  const payloads = await Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => {
    if (image.url) {
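      // fetch the raw image with a 30s timeout; responseType('blob') keeps the body as binary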
      superagent.get(image.url).responseType('blob').timeout(30000).then(function (response) {
        if (response.statusCode === 200) {
          console.log('Resolving', image.url)
          log.info(`Resolving: ${image.url}`)
          return resolve({
            response: {
              content: {
@@ -103,15 +128,15 @@ async function run(url, processor) {
            }
          });
        } else {
          console.log("Invalid status code", response.statusCode, 'for', image.url);
          log.info(`Invalid status code ${response.statusCode} for ${image.url}`);
          resolve({})
        }
      }).catch(error => {
        console.error(error)
        log.error(error.message)
        resolve({})
      });
    } else if (image.base64) {
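      // image was already captured inline by the processor; no network fetch needed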
      return resolve({
        response: {
          content: {
@@ -125,7 +150,7 @@ async function run(url, processor) {
      resolve({})
    }
  })))
  console.log('URLS done downloading')
  log.info('URLS done downloading')
  await browser.close();
  return {
    vin,