Added new processors

crawler
Edward Peterson 3 years ago
parent 7d0fb98baa
commit 79c0749606

@ -22,14 +22,15 @@ app.use(bodyParser.json())
app.post('/convertGalleryToHar', async (req, res) => { app.post('/convertGalleryToHar', async (req, res) => {
const url = req.body.url; const url = req.body.url;
console.log(url); console.log(url);
// get processor // get processor
const processor = _.find(processors, (processor) => url.includes(processor.baseUrl)); const processor = _.find(processors, (processor) => ((new URL(url)).hostname) === (processor.baseUrl));
if (!processor) { if (!processor) {
return res.status(400).json({ return res.status(400).json({
message: 'Could not find processor for url' message: 'Could not find processor for url'
}) })
} }
console.log('Processor found'); console.log('Processor found', processor.baseUrl);
try { try {
const payloads = await run(url, processor); const payloads = await run(url, processor);
res.status(200).json({ res.status(200).json({
@ -44,6 +45,9 @@ app.post('/convertGalleryToHar', async (req, res) => {
} }
}) })
// Responds with the baseUrl of every registered processor as a JSON array.
app.get('/supportedSites', async (req, res) => {
  const supportedHosts = processors.map(({ baseUrl }) => baseUrl);
  res.status(200).json(supportedHosts);
});
app.listen(2667); app.listen(2667);
async function run(url, processor) { async function run(url, processor) {

@ -0,0 +1,17 @@
module.exports = {
baseUrl: 'davidsclassiccars.com',
execute: async function(page) {
const pageLoadIndicator = '.bx-viewport';
await page.waitForSelector(pageLoadIndicator);
const imageSelector = '.carimage > img';
const images = await page.$$(imageSelector);
console.log(`Found ${images.length} images...`)
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
// console.log(src);
return { url: this.baseUrl + src };
}));
return sources;
}
}

@ -68,10 +68,6 @@ module.exports = function (processorConfig) {
return async function (page) { return async function (page) {
console.log('Running generic boostrap extractor') console.log('Running generic boostrap extractor')
// Type into search box.
// await page.type('.devsite-search-field', 'Headless Chrome');
// Wait for suggest overlay to appear and click "show all results".
const allResultsSelector = processorConfig.pageLoadIndicator; const allResultsSelector = processorConfig.pageLoadIndicator;
await page.waitForSelector(allResultsSelector); await page.waitForSelector(allResultsSelector);
var vin; var vin;
@ -85,13 +81,7 @@ module.exports = function (processorConfig) {
vin = uuid.v4(); vin = uuid.v4();
}); });
console.log(vin); console.log(vin);
// await page.click(allResultsSelector);
const client = await page.target().createCDPSession()
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath: './images',
})
const firstImageLinkSelector = processorConfig.carouselTrigger; const firstImageLinkSelector = processorConfig.carouselTrigger;
await page.click(firstImageLinkSelector); await page.click(firstImageLinkSelector);
await page.waitForSelector('.pswp__img') await page.waitForSelector('.pswp__img')

@ -0,0 +1,12 @@
const genericBootstrapFactory = require("./generic-bootstrap")
module.exports = {
execute: genericBootstrapFactory({
baseUrl: 'hemmings.com',
pageLoadIndicator: '#galleries',
vinSelector: '.leading-loose > li:nth-child(2) > span:nth-child(2)',
carouselTrigger: '.bg-center'
}),
baseUrl :'hemmings.com'
}

@ -0,0 +1,15 @@
module.exports = {
baseUrl: 'topclassiccarsforsale.com',
async execute(page) {
const gallerySelector = '.bx-viewport'
const imageSelector = '.bx-viewport > ul > li:not(.bx-clone) > img';
await page.waitForSelector(gallerySelector);
const images = await page.$$(imageSelector);
const sources = await Promise.all(images.map(async carouselItem => {
const src = await page.evaluate(el => el.getAttribute('src'), carouselItem);
console.log(src);
return { url: this.baseUrl + src };
}));
return sources;
}
}
Loading…
Cancel
Save