implemented clew logger

master
Edward Peterson 2 years ago
parent d941164891
commit aecd69ebb9

@ -1,6 +1,7 @@
const puppeteer = require('puppeteer'); const puppeteer = require('puppeteer');
const genericVinParserFactory = require('../processors/generics/generic-vin-parser'); const genericVinParserFactory = require('../processors/generics/generic-vin-parser');
const superagent = require('superagent'); const superagent = require('superagent');
const {log} = require('clew-logger');
module.exports = { module.exports = {
cronString: '15 23 * * 0', cronString: '15 23 * * 0',
run: async function () { run: async function () {
@ -14,18 +15,18 @@ module.exports = {
height: 800 height: 800
}); });
await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...'); log.info('Loading page...');
await page.goto(startingPoint, { timeout: 60000 }); await page.goto(startingPoint, { timeout: 60000 });
console.log('page loaded'); log.info('page loaded');
//get total page count //get total page count
const visiblePageNumbers = await Promise.all((await page.$$('span.pgs > a')).map(async element => await page.evaluate(el => el.textContent, element))) const visiblePageNumbers = await Promise.all((await page.$$('span.pgs > a')).map(async element => await page.evaluate(el => el.textContent, element)))
const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num))); const highestPage = Math.max(...visiblePageNumbers.map(num => parseInt(num)));
for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) { for(let pageNumber = 1;pageNumber < highestPage;pageNumber++) {
console.log('loading page #', pageNumber) log.info(`loading page # ${pageNumber}`)
await page.goto(`${startingPoint}/page/${pageNumber}/`, {timeout: 60000}); await page.goto(`${startingPoint}/page/${pageNumber}/`, {timeout: 60000});
const cars = await module.exports.processPage(page); const cars = await module.exports.processPage(page);
console.log(cars); log.info(cars);
cars.forEach(car => { cars.forEach(car => {
superagent.post('http://localhost:3000/lead/createFromCrawler') superagent.post('http://localhost:3000/lead/createFromCrawler')
.send({ .send({
@ -56,7 +57,7 @@ module.exports = {
links = await this.filterCompletedLinks(links); links = await this.filterCompletedLinks(links);
console.log(`Found ${links.length} unexplored links...`); log.info(`Found ${links.length} unexplored links...`);
@ -68,9 +69,12 @@ module.exports = {
height: 800 height: 800
}); });
await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); await newTab.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...'); log.info('Loading page...');
await newTab.goto(link, { timeout: 60000 }); await newTab.goto(link, { timeout: 60000 });
console.log('loaded new tab', link); log.info({
message: 'loaded new tab',
link
});
const possibleVin = await vinParser(newTab); const possibleVin = await vinParser(newTab);
if(possibleVin){ if(possibleVin){
//candidate found //candidate found

@ -7,43 +7,68 @@ const _ = require('lodash');
const app = express(); const app = express();
var bodyParser = require('body-parser') var bodyParser = require('body-parser')
const CronJob = require('cron').CronJob; const CronJob = require('cron').CronJob;
const clew = require('clew-logger');
clew.create({
prod: true,
prodHost: 'http://tonkatown.docker:3100',
appTag: 'listingExtractor',
})
const log = clew.log;
// console.log(log);
app.use(clew.context.attachContext('listingExtractor'));
app.use(clew.requestLogger);
// setTimeout(() => console.log(clew.log), 5000)
const processors = []; const processors = [];
//load the processor files //load the processor files
glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => { glob(path.resolve(__dirname, 'processors/*.processor.js'), (error, matches) => {
console.log(matches); clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
_.forEach(matches, file => { _.forEach(matches, file => {
const processor = require(path.resolve(__dirname, file)); const processor = require(path.resolve(__dirname, file));
processors.push(processor); processors.push(processor);
})
log.info(`${matches.length} processors loaded`);
}, {
process: 'processor loader'
}) })
console.log(`${matches.length} processors loaded`);
}) })
//load the crawler files //load the crawler files
glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => { glob(path.resolve(__dirname, 'crawlers/*.crawler.js'), (error, matches) => {
console.log(matches); clew.context.assignRequestId(clew.context.init('listingExtractor'), () => {
_.forEach(matches, file => {
const crawler = require(path.resolve(__dirname, file));
const cronJob = new CronJob(crawler.cronString, crawler.run);
cronJob.start();
_.forEach(matches, file => {
const crawler = require(path.resolve(__dirname, file));
const cronJob = new CronJob(crawler.cronString, () => {
const ctx = clew.context.init('listingExtractor');
clew.context.assignRequestId(ctx, crawler.run, {
crawler: file
});
});
cronJob.start();
})
log.info(`${matches.length} crawlers loaded`);
}, {
process: 'crawler loader'
}) })
console.log(`${matches.length} crawlers loaded`);
}) })
app.use(bodyParser.json()) app.use(bodyParser.json())
app.post('/convertGalleryToHar', async (req, res) => { app.post('/convertGalleryToHar', async (req, res) => {
const url = req.body.url; const url = req.body.url;
console.log(url); log.info(url);
// get processor // get processor
const searchableHostname = (new URL(url)).hostname.replace(/^www\./i, ''); const searchableHostname = (new URL(url)).hostname.replace(/^www\./i, '');
console.log('Searching for hostname:', searchableHostname) log.info(`Searching for hostname: ${searchableHostname}`)
const processor = _.find(processors, (processor) => (searchableHostname) === (processor.baseUrl.replace(/^www\./i, ''))); const processor = _.find(processors, (processor) => (searchableHostname) === (processor.baseUrl.replace(/^www\./i, '')));
if (!processor) { if (!processor) {
return res.status(400).json({ return res.status(400).json({
message: 'Could not find processor for url' message: 'Could not find processor for url'
}) })
} }
console.log('Processor found', processor.baseUrl); log.info(`Processor found ${processor.baseUrl}`);
try { try {
const payloads = await run(url, processor); const payloads = await run(url, processor);
res.status(200).json({ res.status(200).json({
@ -55,7 +80,7 @@ app.post('/convertGalleryToHar', async (req, res) => {
// payloads // payloads
}) })
} catch (error) { } catch (error) {
console.log(error); log.info({ message: error.message });
res.status(500).json(error); res.status(500).json(error);
} }
@ -76,23 +101,23 @@ async function run(url, processor) {
height: 800 height: 800
}); });
await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); await page.setUserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
console.log('Loading page...'); log.info('Loading page...');
await page.goto(url, { timeout: 60000 }); await page.goto(url, { timeout: 60000 });
console.log('page loaded'); log.info('page loaded');
console.log('attempting to parse fields'); log.info('attempting to parse fields');
const vin = await processor.parseVIN(page); const vin = await processor.parseVIN(page);
console.log('parsed VIN:', vin); log.info(`parsed VIN: ${vin}`);
const mileage = await processor.parseMileage(page); const mileage = await processor.parseMileage(page);
console.log('parsed Mileage:', mileage); log.info(`parsed Mileage: ${mileage}`);
const galleryUrls = await processor.execute(page); const galleryUrls = await processor.execute(page);
await page.close(); await page.close();
console.log('Done collecting URLS', galleryUrls.length); log.info(`Done collecting URLS (${galleryUrls.length})`);
const payloads = await Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => { const payloads = await Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => {
if(image.url) { if (image.url) {
superagent.get(image.url).responseType('blob').timeout(30000).then(function (response) { superagent.get(image.url).responseType('blob').timeout(30000).then(function (response) {
if (response.statusCode == 200) { if (response.statusCode == 200) {
console.log('Resolving', image.url) log.info(`Resolving: ${image.url}`)
return resolve({ return resolve({
response: { response: {
content: { content: {
@ -103,29 +128,29 @@ async function run(url, processor) {
} }
}); });
} else { } else {
console.log("Invalid status code", response.statusCode, 'for', image.url); log.info(`Invalid status code ${response.statusCode} for ${image.url}`);
resolve({}) resolve({})
} }
}).catch(error => { }).catch(error => {
console.error(error) log.error(error.message)
resolve({}) resolve({})
}); });
} else if (image.base64){ } else if (image.base64) {
return resolve({ return resolve({
response: { response: {
content: { content: {
mimeType: image.contentType, mimeType: image.contentType,
encoding: 'base64', encoding: 'base64',
text: image.base64 text: image.base64
} }
} }
}); });
} else { } else {
resolve({}) resolve({})
} }
}))) })))
console.log('URLS done downloading') log.info('URLS done downloading')
await browser.close(); await browser.close();
return { return {
vin, vin,

1734
package-lock.json generated

File diff suppressed because it is too large Load Diff

@ -10,6 +10,7 @@
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"body-parser": "^1.20.1", "body-parser": "^1.20.1",
"clew-logger": "^1.0.8",
"cron": "^2.2.0", "cron": "^2.2.0",
"express": "^4.18.2", "express": "^4.18.2",
"glob": "^8.0.3", "glob": "^8.0.3",

@ -14,7 +14,6 @@ module.exports = {
const thumbnailNodes = await page.$$(thumbnailSelector); const thumbnailNodes = await page.$$(thumbnailSelector);
const sourcesFromThumbnails = await Promise.all(thumbnailNodes.map(async carouselItem => { const sourcesFromThumbnails = await Promise.all(thumbnailNodes.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: convertThumbnailUrlToFullSize(src) }; return { url: convertThumbnailUrlToFullSize(src) };
})); }));
const sources = sourcesFromThumbnails; const sources = sourcesFromThumbnails;

@ -1,4 +1,6 @@
const genericVinParserFactory = require("./generics/generic-vin-parser"); const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = { module.exports = {
baseUrl: 'classiccarsbay.com', baseUrl: 'classiccarsbay.com',
async execute(page) { async execute(page) {
@ -8,7 +10,7 @@ module.exports = {
const images = await page.$$(imageSelector); const images = await page.$$(imageSelector);
const sources = await Promise.all(images.map(async carouselItem => { const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
console.log(src); log.info(src);
return { url: `https://${module.exports.baseUrl}${src}` }; return { url: `https://${module.exports.baseUrl}${src}` };
})); }));
return sources; return sources;

@ -1,5 +1,6 @@
const _ = require('lodash'); const _ = require('lodash');
const genericVinParserFactory = require("./generics/generic-vin-parser"); const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = { module.exports = {
baseUrl: 'davidsclassiccars.com', baseUrl: 'davidsclassiccars.com',
@ -8,10 +9,9 @@ module.exports = {
await page.waitForSelector(pageLoadIndicator); await page.waitForSelector(pageLoadIndicator);
const imageSelector = '.carimage > img'; const imageSelector = '.carimage > img';
const images = await page.$$(imageSelector); const images = await page.$$(imageSelector);
console.log(`Found ${images.length} images...`) log.info(`Found ${images.length} images...`)
const sources = await Promise.all(images.map(async carouselItem => { const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: this.baseUrl + src }; return { url: this.baseUrl + src };
})); }));
return sources; return sources;

@ -1,5 +1,7 @@
const genericVinParserFactory = require("./generics/generic-vin-parser"); const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = { module.exports = {
baseUrl: 'ebay.com', baseUrl: 'ebay.com',
execute: async function (page) { execute: async function (page) {
@ -12,7 +14,7 @@ module.exports = {
const sellerElement = await page.$(sellerSelector); const sellerElement = await page.$(sellerSelector);
const seller = await page.evaluate(el => el.textContent, sellerElement); const seller = await page.evaluate(el => el.textContent, sellerElement);
console.log(`User is '${seller}'`); log.info(`User is '${seller}'`);
if (seller === 'classicautomall') { if (seller === 'classicautomall') {
await page.waitForSelector(descriptionSelectr); await page.waitForSelector(descriptionSelectr);
@ -28,26 +30,24 @@ module.exports = {
if (hasThumbnailGallery) { if (hasThumbnailGallery) {
const carouselItems = await page.$$(thumbnailGalleryItemSelector); const carouselItems = await page.$$(thumbnailGalleryItemSelector);
// console.log(carouselItems)
const sourcesFromThumbnailGallery = await Promise.all(carouselItems.map(async carouselItem => { const sourcesFromThumbnailGallery = await Promise.all(carouselItems.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: convertThumbnailUrlToFullSize(src) }; return { url: convertThumbnailUrlToFullSize(src) };
})); }));
sources = sourcesFromThumbnailGallery; sources = sourcesFromThumbnailGallery;
} }
console.log('Performing autoscroll') log.info('Performing autoscroll')
await autoScroll(page); //auto scroll to trigger loading of description iframe await autoScroll(page); //auto scroll to trigger loading of description iframe
console.log('autoscroll complete'); log.info('autoscroll complete');
const descriptionIframe = await page.$('#desc_ifr'); const descriptionIframe = await page.$('#desc_ifr');
console.log('has description iframe?', !!descriptionIframe); log.info(`has description iframe? ${!!descriptionIframe}`);
const descriptionUrl = await page.evaluate(el => el.getAttribute('src'), descriptionIframe) const descriptionUrl = await page.evaluate(el => el.getAttribute('src'), descriptionIframe)
console.log('has description URL?', descriptionUrl); log.info(`has description URL? ${descriptionUrl}`);
await page.goto(descriptionUrl); await page.goto(descriptionUrl);
//check the description if it has viewall button //check the description if it has viewall button
const hasViewAllButton = !!(await page.$('.thumbnail-list-wrapper > a.cgg-btn:nth-child(6)')) const hasViewAllButton = !!(await page.$('.thumbnail-list-wrapper > a.cgg-btn:nth-child(6)'))
console.log('hasViewALl', hasViewAllButton) log.info(`hasViewALl ${hasViewAllButton}`)
if (hasViewAllButton) { if (hasViewAllButton) {
const openFullGallerySelector = '.thumbnail-list-wrapper > a.cgg-btn:nth-child(6)'; const openFullGallerySelector = '.thumbnail-list-wrapper > a.cgg-btn:nth-child(6)';
const nextUrl = await page.evaluate(el => el.getAttribute('href'), await page.$(openFullGallerySelector)); const nextUrl = await page.evaluate(el => el.getAttribute('href'), await page.$(openFullGallerySelector));
@ -56,20 +56,18 @@ module.exports = {
const images = await page.$$('.photo'); const images = await page.$$('.photo');
sources = await Promise.all(images.map(async carouselItem => { sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src }; return { url: src };
})); }));
return sources; return sources;
} }
const hasSectionedImages = !!(await page.$('main.content')); const hasSectionedImages = !!(await page.$('main.content'));
console.log(hasSectionedImages); log.info(hasSectionedImages);
if(hasSectionedImages) { if(hasSectionedImages) {
const imageSelector = 'main.content > div > div > div > a > img'; const imageSelector = 'main.content > div > div > div > a > img';
const images = await page.$$(imageSelector); const images = await page.$$(imageSelector);
console.log(images.length) log.info(images.length)
sources = await Promise.all(images.map(async carouselItem => { sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src }; return { url: src };
})); }));
return sources; return sources;
@ -100,7 +98,6 @@ async function classicautomall(page) {
const sources = await Promise.all(items.map(async carouselItem => { const sources = await Promise.all(items.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src }; return { url: src };
})); }));
return sources; return sources;
@ -116,7 +113,6 @@ async function gatewayclassiccars(page) {
const images = await page.$$('.photo'); const images = await page.$$('.photo');
const sources = await Promise.all(images.map(async carouselItem => { const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: src }; return { url: src };
})); }));
return sources; return sources;
@ -130,7 +126,6 @@ async function autoScroll(page){
var scrollHeight = document.body.scrollHeight; var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance); window.scrollBy(0, distance);
totalHeight += distance; totalHeight += distance;
console.log(totalHeight);
if(totalHeight >= scrollHeight - window.innerHeight){ if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer); clearInterval(timer);
resolve(); resolve();

@ -1,4 +1,6 @@
const uuid = require('uuid'); const uuid = require('uuid');
const {log} = require('clew-logger');
module.exports = function (processorConfig) { module.exports = function (processorConfig) {
downloadGallery = async (vin) => { downloadGallery = async (vin) => {
@ -7,7 +9,6 @@ module.exports = function (processorConfig) {
var imgWrap; var imgWrap;
do { do {
imgWrap = document.elementFromPoint(100, 100); imgWrap = document.elementFromPoint(100, 100);
console.log(imgWrap.classList);
await delay(50); await delay(50);
} while (imgWrap.classList.contains('pswp__img--placeholder')); } while (imgWrap.classList.contains('pswp__img--placeholder'));
@ -78,7 +79,7 @@ module.exports = function (processorConfig) {
} }
return async function (page) { return async function (page) {
console.log('Running generic boostrap extractor') log.info('Running generic boostrap extractor')
const allResultsSelector = processorConfig.pageLoadIndicator; const allResultsSelector = processorConfig.pageLoadIndicator;
await page.waitForSelector(allResultsSelector); await page.waitForSelector(allResultsSelector);
@ -92,12 +93,12 @@ module.exports = function (processorConfig) {
console.error('Unable to grab VIN, falling back to UUID'); console.error('Unable to grab VIN, falling back to UUID');
vin = uuid.v4(); vin = uuid.v4();
}); });
console.log(vin); log.info(vin);
const firstImageLinkSelector = processorConfig.carouselTrigger; const firstImageLinkSelector = processorConfig.carouselTrigger;
await page.click(firstImageLinkSelector); await page.click(firstImageLinkSelector);
await page.waitForSelector('.pswp__img') await page.waitForSelector('.pswp__img')
console.log('Gallery is loaded, fetching URLS') log.info('Gallery is loaded, fetching URLS')
const galleryUrls = await page.evaluate(downloadGallery, vin); const galleryUrls = await page.evaluate(downloadGallery, vin);

@ -1,17 +1,18 @@
const _ = require('lodash'); const _ = require('lodash');
const {log} = require('clew-logger');
module.exports = function (config) { module.exports = function (config) {
return async function (page) { return async function (page) {
const pageLoadIndicator = '.show-car-thumbs'; const pageLoadIndicator = '.show-car-thumbs';
await page.waitForSelector(pageLoadIndicator); await page.waitForSelector(pageLoadIndicator);
const imageSelector = '.show-car-thumbs > a'; const imageSelector = '.show-car-thumbs > a';
const images = await page.$$(imageSelector); const images = await page.$$(imageSelector);
console.log(`Found ${images.length} images...`) log.info(`Found ${images.length} images...`)
const sources = await Promise.all(images.map(async carouselItem => { const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('data-original'), carouselItem); const src = await page.evaluate(el => el.getAttribute('data-original'), carouselItem);
if(!src) { if(!src) {
return {} return {}
} }
// console.log(src);
return { url: src }; return { url: src };
})); }));
return sources; return sources;

@ -1,4 +1,6 @@
const genericVinParserFactory = require("./generics/generic-vin-parser"); const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = { module.exports = {
baseUrl: 'superstockamx.com', baseUrl: 'superstockamx.com',
async execute(page) { async execute(page) {
@ -11,7 +13,7 @@ module.exports = {
if(src.includes('base64')) { if(src.includes('base64')) {
//base 64 pasted image //base 64 pasted image
console.log('Found base64 image.') log.info('Found base64 image.')
const regex = /^data:(?<contentType>[^;]+);base64,(?<data>.+)/g; const regex = /^data:(?<contentType>[^;]+);base64,(?<data>.+)/g;
const matches = src.matchAll(regex); const matches = src.matchAll(regex);

@ -1,4 +1,6 @@
const genericVinParserFactory = require("./generics/generic-vin-parser"); const genericVinParserFactory = require("./generics/generic-vin-parser");
const {log} = require('clew-logger');
module.exports = { module.exports = {
baseUrl: 'topclassiccarsforsale.com', baseUrl: 'topclassiccarsforsale.com',
async execute(page) { async execute(page) {
@ -8,7 +10,7 @@ module.exports = {
const images = await page.$$(imageSelector); const images = await page.$$(imageSelector);
const sources = await Promise.all(images.map(async carouselItem => { const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem); const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
console.log(src); log.info(src);
return { url: this.baseUrl + src }; return { url: this.baseUrl + src };
})); }));
return sources; return sources;

Loading…
Cancel
Save