Added API endpoints to trigger process
parent
5e981cedca
commit
be1ab59bac
@ -1,158 +1,85 @@
|
|||||||
const puppeteer = require('puppeteer')
|
const puppeteer = require('puppeteer')
|
||||||
const fs = require('fs');
|
|
||||||
const superagent = require('superagent');
|
const superagent = require('superagent');
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
const uuid = require('uuid');
|
const express = require('express');
|
||||||
|
const glob = require('glob');
|
||||||
|
const _ = require('lodash');
|
||||||
|
const app = express();
|
||||||
|
var bodyParser = require('body-parser')
|
||||||
|
|
||||||
var url = '';
|
|
||||||
process.argv.forEach((val, index, array) => {
|
|
||||||
console.log(val)
|
|
||||||
if(val.startsWith('url=')){
|
|
||||||
url = val.substring(4);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
const processors = [];
|
||||||
|
//load the processor files
|
||||||
|
glob('processors/*.processor.js', (error, matches) => {
|
||||||
|
console.log(matches);
|
||||||
|
_.forEach(matches, file => {
|
||||||
|
const processor = require(path.resolve('./', file));
|
||||||
|
processors.push(processor);
|
||||||
|
})
|
||||||
|
console.log(`${matches.length} processors loaded`);
|
||||||
})
|
})
|
||||||
if(!url || url.length === 0) {
|
app.use(bodyParser.json())
|
||||||
console.log('Need to supply url');
|
app.post('/convertGalleryToHar', async (req, res) => {
|
||||||
process.exit();
|
const url = req.body.url;
|
||||||
}
|
console.log(url);
|
||||||
|
// get processor
|
||||||
|
const processor = _.find(processors, (processor) => url.includes(processor.baseUrl));
|
||||||
const processors = [{
|
if (!processor) {
|
||||||
baseUrl: 'bringatrailer.com',
|
return res.status(400).json({
|
||||||
pageLoadIndicator: '.gallery',
|
message: 'Could not find processor for url'
|
||||||
vinSelector: 'body > main > div.listing > div:nth-child(3) > div.column.column-right.column-right-force > div.essentials > div:nth-child(5) > ul > li:nth-child(1) > a',
|
})
|
||||||
carouselTrigger: '.gallery > a:nth-child(1)'
|
}
|
||||||
|
console.log('Processor found');
|
||||||
|
try {
|
||||||
|
const payloads = await run(url, processor);
|
||||||
|
console.log(payloads);
|
||||||
|
|
||||||
}, {
|
res.status(200).json({
|
||||||
baseUrl: 'classiccars.com',
|
log: {
|
||||||
pageLoadIndicator: '#ListingCarousel',
|
entries: payloads
|
||||||
vinSelector: 'li.p-vin > span:nth-child(2)',
|
}
|
||||||
carouselTrigger: 'div.swiper-slide-active > div > img.u-photo'
|
// payloads
|
||||||
}]
|
})
|
||||||
var processorConfig = {}
|
} catch (error) {
|
||||||
processors.forEach(config => {
|
console.log(error);
|
||||||
if(url.includes(config.baseUrl)){
|
res.status(500).json(error);
|
||||||
processorConfig = config;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
})
|
})
|
||||||
if(!processorConfig) {
|
app.listen(2667);
|
||||||
console.log('Unsupported URL');
|
|
||||||
process.exit();
|
|
||||||
}
|
|
||||||
|
|
||||||
const run = async () => {
|
async function run(url, processor) {
|
||||||
const browser = await puppeteer.launch({
|
const browser = await puppeteer.launch({
|
||||||
headless:true
|
headless: true
|
||||||
});
|
});
|
||||||
|
|
||||||
const page = await browser.newPage();
|
|
||||||
|
|
||||||
await page.goto(url);
|
|
||||||
|
|
||||||
// Type into search box.
|
|
||||||
// await page.type('.devsite-search-field', 'Headless Chrome');
|
|
||||||
|
|
||||||
// Wait for suggest overlay to appear and click "show all results".
|
|
||||||
const allResultsSelector = processorConfig.pageLoadIndicator;
|
|
||||||
await page.waitForSelector(allResultsSelector);
|
|
||||||
var vin;
|
|
||||||
|
|
||||||
const vinSelector = processorConfig.vinSelector;
|
const page = await browser.newPage();
|
||||||
await page.waitForSelector(vinSelector).then(async () => {
|
console.log('Loading page...');
|
||||||
let element = await page.$(vinSelector)
|
await page.goto(url, { timeout: 60000 });
|
||||||
vin = await page.evaluate(el => el.textContent, element);
|
console.log('page loaded')
|
||||||
}).catch(error => {
|
const galleryUrls = await processor.execute(page);
|
||||||
console.error('Unable to grab VIN, falling back to UUID');
|
|
||||||
vin = uuid.v4();
|
|
||||||
});
|
|
||||||
console.log(vin);
|
|
||||||
// await page.click(allResultsSelector);
|
|
||||||
|
|
||||||
const client = await page.target().createCDPSession()
|
|
||||||
await client.send('Page.setDownloadBehavior', {
|
|
||||||
behavior: 'allow',
|
|
||||||
downloadPath: './images',
|
|
||||||
})
|
|
||||||
const firstImageLinkSelector = processorConfig.carouselTrigger;
|
|
||||||
await page.click(firstImageLinkSelector);
|
|
||||||
await page.waitForSelector('.pswp__img')
|
|
||||||
console.log('Gallery is loaded, fetching URLS')
|
|
||||||
const galleryUrls = await page.evaluate(downloadGallery, vin);
|
|
||||||
console.log('Done collecting URLS', galleryUrls.length);
|
|
||||||
await Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => {
|
|
||||||
const fileType = path.extname(image.url);
|
|
||||||
var stream = fs.createWriteStream(path.resolve('images', `${image.fileName}${fileType}`));
|
|
||||||
stream.on('finish', resolve);
|
|
||||||
superagent.get(image.url).pipe(stream);
|
|
||||||
})))
|
|
||||||
console.log('URLS done downloading')
|
|
||||||
|
|
||||||
await browser.close();
|
|
||||||
};
|
|
||||||
|
|
||||||
downloadGallery = async (vin) => {
|
console.log('Done collecting URLS', galleryUrls.length);
|
||||||
var galleryUrls = [];
|
const payloads = await Promise.all(galleryUrls.map(image => new Promise(async (resolve, reject) => {
|
||||||
async function ensureCarouselVisible() {
|
superagent.get(image.url).responseType('blob').then(function (response) {
|
||||||
var imgWrap;
|
if (response.statusCode == 200) {
|
||||||
do {
|
return resolve({
|
||||||
imgWrap = document.elementFromPoint(100, 100);
|
response: {
|
||||||
console.log(imgWrap.classList);
|
content: {
|
||||||
await delay(50);
|
mimeType: response.headers["content-type"],
|
||||||
} while(imgWrap.classList.contains('pswp__img--placeholder'));
|
encoding: 'base64',
|
||||||
|
text: response.body.toString('base64')
|
||||||
}
|
}
|
||||||
|
}
|
||||||
function delay(ms) {
|
});
|
||||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the active image, surround it in an anchor tag, then click it.
|
|
||||||
async function downloadImage(id) {
|
|
||||||
const imgWrap = document.elementFromPoint(300, 300);
|
|
||||||
const children = imgWrap.querySelectorAll('.pswp__img');
|
|
||||||
const img = children[children.length - 1];
|
|
||||||
console.log(img, imgWrap);
|
|
||||||
// debugger;
|
|
||||||
// Full image hasn't loaded yet
|
|
||||||
const src = imgWrap.src.split('?')[0]// get rid of querystring
|
|
||||||
return downloadSrc(src, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
function downloadSrc(src, id) {
|
|
||||||
galleryUrls.push({
|
|
||||||
fileName: id,
|
|
||||||
url:src
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
function nextImage() {
|
|
||||||
document.querySelector('.pswp__button.pswp__button--arrow--right').click();
|
|
||||||
}
|
}
|
||||||
|
reject();
|
||||||
function getCounterValue() {
|
});
|
||||||
const [position, total] = document.querySelector('.pswp__counter').textContent.split('/');
|
})))
|
||||||
return parseInt(position.trim(), 10);
|
console.log('URLS done downloading')
|
||||||
}
|
await browser.close();
|
||||||
|
return payloads;
|
||||||
function run() {
|
}
|
||||||
return new Promise(async (resolve, reject) => {
|
|
||||||
await ensureCarouselVisible();
|
|
||||||
await delay(500);
|
|
||||||
const firstValue = getCounterValue();
|
|
||||||
do {
|
|
||||||
await downloadImage(`${vin}-${getCounterValue()}`);
|
|
||||||
nextImage();
|
|
||||||
await delay(100);
|
|
||||||
} while(getCounterValue() !== firstValue);
|
|
||||||
return resolve(galleryUrls)
|
|
||||||
})
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Promise((resolve, reject) => {
|
|
||||||
run().then(resolve);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
run();
|
// run();
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
const genericBootstrapFactory = require("./generic-bootstrap")
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
execute: genericBootstrapFactory({
|
||||||
|
pageLoadIndicator: '.gallery',
|
||||||
|
vinSelector: 'body > main > div.listing > div:nth-child(3) > div.column.column-right.column-right-force > div.essentials > div:nth-child(5) > ul > li:nth-child(1) > a',
|
||||||
|
carouselTrigger: '.gallery > a:nth-child(1)'
|
||||||
|
|
||||||
|
}),
|
||||||
|
baseUrl: 'bringatrailer.com'
|
||||||
|
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
const genericBootstrapFactory = require("./generic-bootstrap")
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
execute: genericBootstrapFactory({
|
||||||
|
baseUrl: 'classiccars.com',
|
||||||
|
pageLoadIndicator: '#ListingCarousel',
|
||||||
|
vinSelector: 'li.p-vin > span:nth-child(2)',
|
||||||
|
carouselTrigger: 'div.swiper-slide-active > div > img.u-photo'
|
||||||
|
}),
|
||||||
|
baseUrl :'classiccars.com'
|
||||||
|
}
|
||||||
@ -0,0 +1,105 @@
|
|||||||
|
const uuid = require('uuid');
|
||||||
|
module.exports = function (processorConfig) {
|
||||||
|
|
||||||
|
downloadGallery = async (vin) => {
|
||||||
|
var galleryUrls = [];
|
||||||
|
async function ensureCarouselVisible() {
|
||||||
|
var imgWrap;
|
||||||
|
do {
|
||||||
|
imgWrap = document.elementFromPoint(100, 100);
|
||||||
|
console.log(imgWrap.classList);
|
||||||
|
await delay(50);
|
||||||
|
} while (imgWrap.classList.contains('pswp__img--placeholder'));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function delay(ms) {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the active image, surround it in an anchor tag, then click it.
|
||||||
|
async function downloadImage(id) {
|
||||||
|
const imgWrap = document.elementFromPoint(300, 300);
|
||||||
|
const children = imgWrap.querySelectorAll('.pswp__img');
|
||||||
|
const img = children[children.length - 1];
|
||||||
|
console.log(img, imgWrap);
|
||||||
|
// debugger;
|
||||||
|
// Full image hasn't loaded yet
|
||||||
|
const src = imgWrap.src.split('?')[0]// get rid of querystring
|
||||||
|
return downloadSrc(src, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
function downloadSrc(src, id) {
|
||||||
|
galleryUrls.push({
|
||||||
|
fileName: id,
|
||||||
|
url: src
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
function nextImage() {
|
||||||
|
document.querySelector('.pswp__button.pswp__button--arrow--right').click();
|
||||||
|
}
|
||||||
|
|
||||||
|
function getCounterValue() {
|
||||||
|
const [position, total] = document.querySelector('.pswp__counter').textContent.split('/');
|
||||||
|
return parseInt(position.trim(), 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
function run() {
|
||||||
|
return new Promise(async (resolve, reject) => {
|
||||||
|
await ensureCarouselVisible();
|
||||||
|
await delay(500);
|
||||||
|
const firstValue = getCounterValue();
|
||||||
|
do {
|
||||||
|
await downloadImage(`${vin}-${getCounterValue()}`);
|
||||||
|
nextImage();
|
||||||
|
await delay(100);
|
||||||
|
} while (getCounterValue() !== firstValue);
|
||||||
|
return resolve(galleryUrls)
|
||||||
|
})
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
run().then(resolve);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return async function (page) {
|
||||||
|
console.log('Running generic boostrap extractor')
|
||||||
|
|
||||||
|
// Type into search box.
|
||||||
|
// await page.type('.devsite-search-field', 'Headless Chrome');
|
||||||
|
|
||||||
|
// Wait for suggest overlay to appear and click "show all results".
|
||||||
|
const allResultsSelector = processorConfig.pageLoadIndicator;
|
||||||
|
await page.waitForSelector(allResultsSelector);
|
||||||
|
var vin;
|
||||||
|
|
||||||
|
const vinSelector = processorConfig.vinSelector;
|
||||||
|
await page.waitForSelector(vinSelector).then(async () => {
|
||||||
|
let element = await page.$(vinSelector)
|
||||||
|
vin = await page.evaluate(el => el.textContent, element);
|
||||||
|
}).catch(error => {
|
||||||
|
console.error('Unable to grab VIN, falling back to UUID');
|
||||||
|
vin = uuid.v4();
|
||||||
|
});
|
||||||
|
console.log(vin);
|
||||||
|
// await page.click(allResultsSelector);
|
||||||
|
|
||||||
|
const client = await page.target().createCDPSession()
|
||||||
|
await client.send('Page.setDownloadBehavior', {
|
||||||
|
behavior: 'allow',
|
||||||
|
downloadPath: './images',
|
||||||
|
})
|
||||||
|
const firstImageLinkSelector = processorConfig.carouselTrigger;
|
||||||
|
await page.click(firstImageLinkSelector);
|
||||||
|
await page.waitForSelector('.pswp__img')
|
||||||
|
console.log('Gallery is loaded, fetching URLS')
|
||||||
|
const galleryUrls = await page.evaluate(downloadGallery, vin);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return galleryUrls
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue