126 lines
3.6 KiB
JavaScript
126 lines
3.6 KiB
JavaScript
|
|
import axios from 'axios';
|
||
|
|
import cheerio from 'cheerio';
|
||
|
|
|
||
|
|
/**
 * Scrapes basic book metadata (title, subtitle, page count, format, edition)
 * from Amazon product pages addressed by ISBN.
 *
 * Label matching in `extractData` uses Italian keywords ("pagine", "formato",
 * "edizione"), so it targets the Italian storefront by default.
 */
class AmazonBookScraper {
  /**
   * @param {object} [options]
   * @param {string} [options.baseUrl] - Product URL prefix the ISBN is appended to.
   * @param {number} [options.timeoutMs] - Per-request timeout in milliseconds.
   */
  constructor({ baseUrl = 'https://www.amazon.it/dp/', timeoutMs = 15000 } = {}) {
    this.baseUrl = baseUrl;
    this.timeoutMs = timeoutMs;
  }

  /**
   * Downloads the product page for the given ISBN.
   *
   * @param {string|number} isbn - Identifier appended to `baseUrl`.
   * @returns {Promise<*>} The response body, or `null` when the ISBN is
   *   missing/blank or the request fails (the failure is logged, not thrown).
   */
  async fetchPage(isbn) {
    const code = isbn == null ? '' : String(isbn).trim();
    if (code === '') {
      // Avoid issuing a pointless request to ".../dp/undefined".
      console.error('Errore fetching ISBN: ISBN mancante');
      return null;
    }

    // Encode so that stray characters cannot alter the request path.
    const url = `${this.baseUrl}${encodeURIComponent(code)}`;
    try {
      const { data } = await axios.get(url, {
        // Without a timeout a stalled connection would block the caller forever.
        timeout: this.timeoutMs,
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' +
            'AppleWebKit/537.36 (KHTML, like Gecko) ' +
            'Chrome/113.0.0.0 Safari/537.36',
          // add further headers here if needed
        },
      });
      return data;
    } catch (err) {
      console.error(`Errore fetching ISBN ${isbn}:`, err.message);
      return null;
    }
  }

  /**
   * Extracts book fields from a product page.
   *
   * Reads `#productTitle` and `#productSubtitle`, then looks for page count,
   * format and edition in `#detailBullets_feature_div`; any field still missing
   * is looked up in `#productDetailsTable` as a fallback layout.
   *
   * @param {string} html - Page markup to parse.
   * @returns {{title: ?string, subtitle: ?string, pages: ?string,
   *   format: ?string, edition: ?string}} Fields not found are `null`.
   */
  extractData(html) {
    const $ = cheerio.load(html);

    const title = $('#productTitle').text().trim() || null;
    const subtitle = $('#productSubtitle').text().trim() || null;

    const details = { pages: null, format: null, edition: null };

    // Maps a lower-cased row label to the field it describes (or null).
    const fieldFor = (label) => {
      if (label.includes('pagine')) return 'pages';
      if (label.includes('formato')) return 'format';
      if (label.includes('edizione')) return 'edition';
      return null;
    };

    // Primary layout: bullet list; a later matching bullet overrides an earlier one.
    $('#detailBullets_feature_div li').each((i, el) => {
      const label = $(el).find('span.a-text-bold').text().trim().toLowerCase();
      const value = $(el).find('span').last().text().trim();
      const field = fieldFor(label);
      // Skip empty values so a blank bullet never masks real data.
      if (field && value) details[field] = value;
    });

    // Fallback layout: details table; only fills fields still missing.
    if (!details.pages || !details.format || !details.edition) {
      $('#productDetailsTable .content tr').each((i, el) => {
        const label = $(el).find('th').text().trim().toLowerCase();
        const value = $(el).find('td').text().trim();
        const field = fieldFor(label);
        if (field && value && !details[field]) details[field] = value;
      });
    }

    return { title, subtitle, ...details };
  }

  /**
   * Fetches and parses the product page for a single ISBN.
   *
   * @param {string|number} isbn
   * @returns {Promise<?object>} Extracted fields, or `null` if the page could
   *   not be fetched.
   */
  async scrapeISBN(isbn) {
    const html = await this.fetchPage(isbn);
    return html ? this.extractData(html) : null;
  }

  /**
   * Scrapes several ISBNs one after another.
   *
   * @param {Iterable<string|number>} isbnList - ISBNs to scrape, in order.
   * @param {number} [delayMs=2000] - Pause between consecutive requests, to
   *   reduce the chance of being blocked. No pause follows the last request.
   * @returns {Promise<object[]>} One `{ isbn, ...fields }` entry per ISBN;
   *   entries whose scrape failed contain only `isbn`.
   */
  async scrapeMultiple(isbnList, delayMs = 2000) {
    const queue = Array.from(isbnList || []);
    const results = [];

    for (const [index, isbn] of queue.entries()) {
      console.log(`Scraping ISBN: ${isbn}`);
      const data = await this.scrapeISBN(isbn);
      // Spreading `null` adds nothing, so failed lookups yield `{ isbn }`.
      results.push({ isbn, ...data });

      const hasMore = index < queue.length - 1;
      if (hasMore && delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
    }

    return results;
  }
}
|
||
|
|
|
||
|
|
/**
 * Scrapes the Amazon product page for a single ISBN.
 *
 * @param {*} idapp - Application identifier; not used by this function.
 * @param {object} [options]
 * @param {string|number} [options.isbn] - ISBN to look up.
 * @returns {Promise<?object>} The scraped fields, or `null` when no ISBN was
 *   supplied or the page could not be fetched.
 * @throws {Error} If scraping fails unexpectedly; the original error is
 *   attached as `cause` so the caller can decide how to respond.
 */
export async function ScraperDataAmazon(idapp, options) {
  const isbn = options ? options.isbn : undefined;
  if (isbn == null || String(isbn).trim() === '') {
    console.error('ScraperDataAmazon: missing options.isbn');
    return null;
  }

  const scraper = new AmazonBookScraper();

  try {
    const data = await scraper.scrapeISBN(isbn);
    console.log(data);
    return data;
  } catch (e) {
    console.error(e);
    // No HTTP response object exists in this scope, so the failure is
    // propagated to the caller instead of being answered here.
    throw new Error(`ScraperDataAmazon failed for ISBN ${isbn}`, { cause: e });
  }
}
|
||
|
|
|
||
|
|
/**
 * Scrapes the Amazon product pages for a list of ISBNs.
 *
 * @param {*} idapp - Application identifier; not used by this function.
 * @param {object} [options]
 * @param {Array<string|number>} [options.isbnList] - ISBNs to scrape. When
 *   omitted (or not an array) a built-in sample list is used.
 * @returns {Promise<object[]>} One entry per ISBN, as produced by
 *   `AmazonBookScraper#scrapeMultiple`.
 * @throws {Error} If scraping fails unexpectedly; the original error is
 *   attached as `cause` so the caller can decide how to respond.
 */
export async function ScraperMultipleDataAmazon(idapp, options) {
  const requested = options ? options.isbnList : undefined;
  // Fall back to the sample ISBN only when the caller supplied no list at all.
  const isbnList = Array.isArray(requested) ? requested : ['8850224248'];

  const scraper = new AmazonBookScraper();

  try {
    const books = await scraper.scrapeMultiple(isbnList);
    console.log(books);
    return books;
  } catch (e) {
    console.error(e);
    // No HTTP response object exists in this scope, so the failure is
    // propagated to the caller instead of being answered here.
    throw new Error('ScraperMultipleDataAmazon failed', { cause: e });
  }
}
|
||
|
|
|
||
|
|
export default AmazonBookScraper;
|