implemented clew logger

master
Edward Peterson 2 years ago
parent d941164891
commit aecd69ebb9

@ -1,6 +1,7 @@
const puppeteer = require('puppeteer');
const genericVinParserFactory = require('../processors/generics/generic-vin-parser');
const superagent = require('superagent');
const {log} = require('clew-logger');
module.exports = {
cronString: '15 23 * * 0',
run: async function () {
@ -14,18 +15,18 @@ module.exports = {
height: 800
});
await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...');
log.info('Loading page...');
await page.goto(startingPoint, { timeout: 60000 });
console.log('page loaded');
log.info('page loaded');
//get total page count
const visiblePageNumbers = await Promise.all((await page.$$('span.pgs > a')).map(async element => await page.evaluate(el => el.textContent, element)))
const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num)));
for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) {
console.log('loading page #', pageNumber)
log.info(`loading page # ${pageNumber}`)
await page.goto(`${startingPoint}/page/${pageNumber}/`, {timeout: 60000});
const cars = await module.exports.processPage(page);
console.log(cars);
log.info(cars);
cars.forEach(car => {
superagent.post('http://localhost:3000/lead/createFromCrawler')
.send({
@ -56,7 +57,7 @@ module.exports = {
links = await this.filterCompletedLinks(links);
console.log(`Found ${links.length} unexplored links...`);
log.info(`Found ${links.length} unexplored links...`);
@ -68,9 +69,12 @@ module.exports = {
height: 800
});
await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...');
log.info('Loading page...');
await newTab.goto(link, { timeout: 60000 });
console.log('loaded new tab', link);
log.info({
message: 'loaded new tab',
link
});
const possibleVin = await vinParser(newTab);
if(possibleVin){
//candidate found

@ -7,43 +7,68 @@ const _ = require('lodash');
const app = express();
var bodyParser = require('body-parser')
const CronJob = require('cron').CronJob;
const clew = require('clew-logger');
// Configure clew-logger at module load, ahead of the app.use(...) calls below.
// NOTE(review): the exact meaning of `prod`, `prodHost` and `appTag` is defined by
// clew-logger, not by this file — confirm against that package before changing them.
clew.create({
prod: true,
prodHost: 'http://tonkatown.docker:3100',
appTag: 'listingExtractor',
})
// Logger handle used by the log.info(...) / log.error(...) calls later in this file.
const log = clew.log;
// console.log(log);
// Mount two clew-provided middlewares on the Express app. Going by their names they
// attach a logging context and log requests — presumably; verify in clew-logger.
// They are registered before bodyParser.json() and before the route handlers below.
app.use(clew.context.attachContext('listingExtractor'));
app.use(clew.requestLogger);
// setTimeout(() => console.log(clew.log), 5000)
const processors = [];
//load the processor files
glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
console.log(matches);
clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
_.forEach(matches, file => {
const processor = require(path.resolve(__dirname, file));
processors.push(processor);
})
console.log(`${matches.length} processors loaded`);
log.info(`${matches.length} processors loaded`);
}, {
process: 'processor loader'
})
})
//load the crawler files
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
console.log(matches);
clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
_.forEach(matches, file => {
const crawler = require(path.resolve(__dirname, file));
const cronJob = new CronJob(crawler.cronString, crawler.run);
const cronJob = new CronJob(crawler.cronString, () => {
const ctx = clew.context.init('listingExtractor');
clew.context.assignRequestId(ctx, crawler.run, {
crawler: file
});
});
cronJob.start();
})
console.log(`${matches.length} crawlers loaded`);
log.info(`${matches.length} crawlers loaded`);
}, {
process: 'crawler loader'
})
})
app.use(bodyParser.json())
app.post('/convertGalleryToHar', async (req, res) => {
const url = req.body.url;
console.log(url);
log.info(url);
// get processor
const searchableHostname = (new URL(url)).hostname.replace(/^www\./i, '');
console.log('Searching for hostname:', searchableHostname)
log.info(`Searching for hostname: ${searchableHostname}`)
const processor = _.find(processors, (processor) => (searchableHostname) === (processor.baseUrl.replace(/^www\./i, '')));
if (!processor) {
return res.status(400).json({
message: 'Could not find processor for url'
})
}
console.log('Processor found', processor.baseUrl);
log.info(`Processor found ${processor.baseUrl}`);
try {
const payloads = await run(url, processor);
res.status(200).json({
@ -55,7 +80,7 @@ app.post('/convertGalleryToHar', async (req, res) => {
// payloads
})
} catch (error) {
console.log(error);
log.info({ message: error.message });
res.status(500).json(error);
}
@ -76,23 +101,23 @@ async function run(url, processor) {
height: 800
});
await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...');
log.info('Loading page...');
await page.goto(url, { timeout: 60000 });
console.log('page loaded');
console.log('attempting to parse fields');
log.info('page loaded');
log.info('attempting to parse fields');
const vin = await processor.parseVIN(page);
console.log('parsed VIN:', vin);
log.info(`parsed VIN: ${vin}`);
const mileage = await processor.parseMileage(page);
console.log('parsed Mileage:', mileage);
log.info(`parsed Mileage: ${mileage}`);
const galleryUrls = await processor.execute(page);
await page.close();
console.log('Done collecting URLS', galleryUrls.length);
log.info(`Done collecting URLS (${galleryUrls.length})`);
const payloads = await Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => {
if (image.url) {
superagent.get(image.url).responseType('blob').timeout(30000).then(function (response) {
if (response.statusCode == 200) {
console.log('Resolving', image.url)
log.info(`Resolving: ${image.url}`)
return resolve({
response: {
content: {
@ -103,12 +128,12 @@ async function run(url, processor) {
}
});
} else {
console.log("Invalid status code", response.statusCode, 'for', image.url);
log.info(`Invalid status code ${response.statusCode} for ${image.url}`);
resolve({})
}
}).catch(error => {
console.error(error)
log.error(error.message)
resolve({})
});
} else if (image.base64) {
@ -125,7 +150,7 @@ async function run(url, processor) {
resolve({})
}
})))
console.log('URLS done downloading')
log.info('URLS done downloading')
await browser.close();
return {
vin,

1618
package-lock.json generated

File diff suppressed because it is too large. (Load Diff)

@ -10,6 +10,7 @@
"license": "ISC",
"dependencies": {
"body-parser": "^1.20.1",
"clew-logger": "^1.0.8",
"cron": "^2.2.0",
"express": "^4.18.2",
"glob": "^8.0.3",

@ -14,7 +14,6 @@ module.exports = {
const thumbnailNodes = await page.$$(thumbnailSelector);
const sourcesFromThumbnails = await Promise.all(thumbnailNodes.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: convertThumbnailUrlToFullSize(src) };
}));
const sources = sourcesFromThumbnails;

@ -1,4 +1,6 @@
const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = {
baseUrl: 'classiccarsbay.com',
async execute(page) {
@ -8,7 +10,7 @@ module.exports = {
const images = await page.$$(imageSelector);
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
console.log(src);
log.info(src);
return { url: `https://${module.exports.baseUrl}${src}` };
}));
return sources;

@ -1,5 +1,6 @@
const _ = require('lodash');
const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = {
baseUrl: 'davidsclassiccars.com',
@ -8,10 +9,9 @@ module.exports = {
await page.waitForSelector(pageLoadIndicator);
const imageSelector = '.carimage > img';
const images = await page.$$(imageSelector);
console.log(`Found ${images.length} images...`)
log.info(`Found ${images.length} images...`)
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: this.baseUrl + src };
}));
return sources;

@ -1,5 +1,7 @@
const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = {
baseUrl: 'ebay.com',
execute: async function (page) {
@ -12,7 +14,7 @@ module.exports = {
const sellerElement = await page.$(sellerSelector);
const seller = await page.evaluate(el => el.textContent, sellerElement);
console.log(`User is '${seller}'`);
log.info(`User is '${seller}'`);
if (seller === 'classicautomall') {
await page.waitForSelector(descriptionSelectr);
@ -28,26 +30,24 @@ module.exports = {
if (hasThumbnailGallery) {
const carouselItems = await page.$$(thumbnailGalleryItemSelector);
// console.log(carouselItems)
const sourcesFromThumbnailGallery = await Promise.all(carouselItems.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: convertThumbnailUrlToFullSize(src) };
}));
sources = sourcesFromThumbnailGallery;
}
console.log('Performing autoscroll')
log.info('Performing autoscroll')
await autoScroll(page); //auto scroll to trigger loading of description iframe
console.log('autoscroll complete');
log.info('autoscroll complete');
const descriptionIframe = await page.$('#desc_ifr');
console.log('has description iframe?', !!descriptionIframe);
log.info(`has description iframe? ${!!descriptionIframe}`);
const descriptionUrl = await page.evaluate(el => el.getAttribute('src'), descriptionIframe)
console.log('has description URL?', descriptionUrl);
log.info(`has description URL? ${descriptionUrl}`);
await page.goto(descriptionUrl);
//check the description if it has viewall button
const hasViewAllButton = !!(await page.$('.thumbnail-list-wrapper > a.cgg-btn:nth-child(6)'))
console.log('hasViewALl', hasViewAllButton)
log.info(`hasViewALl ${hasViewAllButton}`)
if (hasViewAllButton) {
const openFullGallerySelector = '.thumbnail-list-wrapper > a.cgg-btn:nth-child(6)';
const nextUrl = await page.evaluate(el => el.getAttribute('href'), await page.$(openFullGallerySelector));
@ -56,20 +56,18 @@ module.exports = {
const images = await page.$$('.photo');
sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src };
}));
return sources;
}
const hasSectionedImages = !!(await page.$('main.content'));
console.log(hasSectionedImages);
log.info(hasSectionedImages);
if(hasSectionedImages) {
const imageSelector = 'main.content > div > div > div > a > img';
const images = await page.$$(imageSelector);
console.log(images.length)
log.info(images.length)
sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src };
}));
return sources;
@ -100,7 +98,6 @@ async function classicautomall(page) {
const sources = await Promise.all(items.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src };
}));
return sources;
@ -116,7 +113,6 @@ async function gatewayclassiccars(page) {
const images = await page.$$('.photo');
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src };
}));
return sources;
@ -130,7 +126,6 @@ async function autoScroll(page){
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
console.log(totalHeight);
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();

@ -1,4 +1,6 @@
const uuid = require('uuid');
const {log} = require('clew-logger');
module.exports = function (processorConfig) {
downloadGallery = async (vin) => {
@ -7,7 +9,6 @@ module.exports = function (processorConfig) {
var imgWrap;
do {
imgWrap = document.elementFromPoint(100, 100);
console.log(imgWrap.classList);
await delay(50);
} while (imgWrap.classList.contains('pswp__img--placeholder'));
@ -78,7 +79,7 @@ module.exports = function (processorConfig) {
}
return async function (page) {
console.log('Running generic boostrap extractor')
log.info('Running generic boostrap extractor')
const allResultsSelector = processorConfig.pageLoadIndicator;
await page.waitForSelector(allResultsSelector);
@ -92,12 +93,12 @@ module.exports = function (processorConfig) {
console.error('Unable to grab VIN, falling back to UUID');
vin = uuid.v4();
});
console.log(vin);
log.info(vin);
const firstImageLinkSelector = processorConfig.carouselTrigger;
await page.click(firstImageLinkSelector);
await page.waitForSelector('.pswp__img')
console.log('Gallery is loaded, fetching URLS')
log.info('Gallery is loaded, fetching URLS')
const galleryUrls = await page.evaluate(downloadGallery, vin);

@ -1,17 +1,18 @@
const _ = require('lodash');
const {log} = require('clew-logger');
module.exports = function (config) {
return async function (page) {
const pageLoadIndicator = '.show-car-thumbs';
await page.waitForSelector(pageLoadIndicator);
const imageSelector = '.show-car-thumbs > a';
const images = await page.$$(imageSelector);
console.log(`Found ${images.length} images...`)
log.info(`Found ${images.length} images...`)
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('data-original'), carouselItem);
if(!src) {
return {}
}
// console.log(src);
return { url: src };
}));
return sources;

@ -1,4 +1,6 @@
const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = {
baseUrl: 'superstockamx.com',
async execute(page) {
@ -11,7 +13,7 @@ module.exports = {
if(src.includes('base64')) {
//base 64 pasted image
console.log('Found base64 image.')
log.info('Found base64 image.')
const regex = /^data:(?<contentType>[^;]+);base64,(?<data>.+)/g;
const matches = src.matchAll(regex);

@ -1,4 +1,6 @@
const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = {
baseUrl: 'topclassiccarsforsale.com',
async execute(page) {
@ -8,7 +10,7 @@ module.exports = {
const images = await page.$$(imageSelector);
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
console.log(src);
log.info(src);
return { url: this.baseUrl + src };
}));
return sources;

Loading…
Cancel
Save