init repository

This commit is contained in:
Pierre-Edouard Portier 2021-12-21 14:13:20 +01:00
parent 0dcfdf5900
commit 43db451ee3
2 changed files with 158 additions and 0 deletions

143
main.js Normal file
View File

@ -0,0 +1,143 @@
const puppeteer = require('puppeteer');
const readline = require('readline');
const fs = require('fs');
const needle = require('needle');
const { execSync, execFileSync } = require('child_process');
/**
 * Prompts the user on stdout and resolves with the line entered on stdin.
 *
 * @param {string} label - Text of the prompt; ': ' is appended to it.
 * @param {boolean} [muted=false] - When true, echoed input is replaced by '*'
 *   (intended for passwords). The prompt itself and line breaks are still
 *   written unchanged.
 * @returns {Promise<string>} The answer handed to the `question` callback.
 */
async function readLine(label, muted = false) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  // Start unmuted: the prompt goes through the same write hook as the echoed
  // input, and masking it would hide the question from the user.
  rl.stdoutMuted = false;
  // Override of the interface's write hook (underscore-prefixed, i.e. not part
  // of the documented readline API) so that echoed input can be masked.
  rl._writeToOutput = function _writeToOutput(str) {
    // Line breaks are passed through so that pressing Enter still moves the
    // cursor to the next line instead of printing one more '*'.
    const isLineBreak = /^[\r\n]+$/.test(str);
    if (rl.stdoutMuted && !isLineBreak) {
      rl.output.write('*');
    } else {
      rl.output.write(str);
    }
  };
  return new Promise((resolve) => {
    rl.question(label + ': ', (answer) => {
      rl.close();
      resolve(answer);
    });
    // Mask only what is written after the prompt has been issued.
    rl.stdoutMuted = muted;
  });
}
/**
 * Clicks `selector` on `page` and resolves once the resulting navigation is done.
 * The click and the navigation wait are started together so the navigation
 * cannot complete before the wait is registered.
 */
function clickAndWaitForNavigation(page, selector) {
  return Promise.all([
    page.click(selector),
    page.waitForNavigation(),
  ]);
}

/**
 * Re-issues an intercepted PDF request with needle and saves the body to `filename`.
 * The request headers and the popup's cookies are forwarded with the request.
 *
 * @returns {Promise<string|null>} `filename` on success, `null` when the
 *   download failed (the failure is logged, not thrown, so that one bad page
 *   does not abort the whole run).
 */
async function downloadPdf(pop, request, filename) {
  try {
    const cookies = await pop.cookies();
    // Build a fresh header object instead of mutating the request's own one.
    const headers = {
      ...request.headers(),
      Cookie: cookies.map((ck) => ck.name + '=' + ck.value).join(';'),
    };
    const response = await new Promise((resolve, reject) => {
      needle.get(request.url(), { headers, output: filename }, (err, resp) => {
        if (err) {
          reject(err);
        } else {
          resolve(resp);
        }
      });
    });
    if (response.statusCode !== 200) {
      throw new Error(`unexpected HTTP status ${response.statusCode}`);
    }
    return filename;
  } catch (err) {
    console.error(`download of ${filename} failed: ${err.message}`);
    return null;
  }
}

/** Returns today's local date formatted as `YYYY_MM_DD`. */
function formatToday() {
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0'); // getMonth() is 0-based
  const day = String(now.getDate()).padStart(2, '0');
  return year + '_' + month + '_' + day;
}

/**
 * Concatenates `filenames` (in order) into `outputName` with Ghostscript.
 * Arguments are passed as an array, without a shell, so that the user-supplied
 * part of the output name cannot be interpreted as shell syntax.
 * Throws if `gs` cannot be started or exits with a non-zero status.
 */
function mergePdfs(filenames, outputName) {
  execFileSync('gs', [
    '-dBATCH',
    '-dNOPAUSE',
    '-q',
    '-sDEVICE=pdfwrite',
    '-dPDFSETTINGS=/prepress',
    '-sOutputFile=' + outputName,
    ...filenames,
  ], { stdio: 'inherit' });
}

/**
 * Script entry point: logs in through the library portal, lets the user pick a
 * newspaper, downloads every page of the PDF edition and merges the pages into
 * a single `<YYYY_MM_DD>_<newspaper id>.pdf` file in the current directory.
 */
(async () => {
  // Every per-page file name handed out, so all of them can be removed at the
  // end even when a download or the merge failed.
  const tempFiles = [];
  try {
    const browser = await puppeteer.launch({ headless: true });
    let journalName;
    let pdfFilenames;
    try {
      const page = await browser.newPage();
      await page.setViewport({ width: 1366, height: 768 });
      await page.goto("https://bibliotheque.insa-lyon.fr/parcours-recherche/panorama_des_ressources");
      // go to "my account"
      await clickAndWaitForNavigation(page, '#boite_13 > a');
      // open the sign-in page
      await clickAndWaitForNavigation(page, '#boite_23 > div.widget-body.card-body > div > button');

      await page.click('#username');
      const username = await readLine('login');
      await page.keyboard.type(username);
      await page.click('#password');
      console.log('password:');
      const password = await readLine('', true);
      await page.keyboard.type(password);
      // submit the CAS login form
      await clickAndWaitForNavigation(page, '#fm1 > div.row.btn-row > input.btn-submit');

      await page.goto("https://docelec.insa-lyon.fr/login?url=http://nouveau.europresse.com/access/ip/default.aspx?un=INSAT_3");
      await page.goto("https://nouveau-europresse-com.docelec.insa-lyon.fr/webpages/Pdf/SearchForm.aspx");

      // List the available newspapers (runs in the page context).
      const sources = await page.evaluate(() =>
        Array.from(
          document.querySelectorAll("#lbSources option"),
          (item) => ({ id: item.value, label: item.innerText }),
        ));
      for (const { id, label } of sources) {
        console.log(id + ' : ' + label);
      }
      journalName = (await readLine('newspaper id')).trim(); // "EC_P", "LF_P"
      // The id ends up in the output file name, so only accept a listed one.
      if (!sources.some((source) => source.id === journalName)) {
        throw new Error(`unknown newspaper id: "${journalName}"`);
      }
      await page.select("select#lbSources", journalName);
      // the search opens the PDF viewer in a popup
      await clickAndWaitForNavigation(page, '#btnSearch');
      await page.waitForTimeout(2000);
      const pages = await browser.pages();
      const pop = pages[pages.length - 1]; // presumably the popup that was just opened

      await pop.setRequestInterception(true);
      const downloads = [];
      pop.on('request', (request) => {
        if (request.url().includes('DocName=pdf')) {
          // The file name is reserved synchronously, before any await, so two
          // overlapping requests can never be given the same index.
          const filename = tempFiles.length + '.pdf';
          tempFiles.push(filename);
          downloads.push(downloadPdf(pop, request, filename));
          // NOTE(review): as before, the intercepted PDF request itself is
          // neither continued nor aborted; the file is fetched by needle.
        } else {
          request.continue().catch((err) => {
            console.error(`could not continue request: ${err.message}`);
          });
        }
      });

      const frameList = pop.frames().find((frame) => frame.name() === 'ListDoc');
      if (!frameList) {
        throw new Error('frame "ListDoc" not found in the PDF popup');
      }
      await frameList.waitForSelector('#listdoc');
      // Selectors of the per-page links (runs in the frame context).
      const pdfIds = await frameList.evaluate(() =>
        Array.from(
          document.querySelectorAll("#listdoc a"),
          (item) => '#' + item.id,
        ));
      for (const id of pdfIds) {
        console.log('page: ' + id);
        // Open the page and give its PDF request one second to be issued.
        await Promise.all([
          frameList.click(id),
          frameList.waitForTimeout(1000),
        ]);
      }

      // Wait for every download to be written to disk before going on, and
      // keep only the pages that were actually retrieved.
      const results = await Promise.all(downloads);
      pdfFilenames = results.filter((name) => name !== null);
    } finally {
      await browser.close();
    }

    if (pdfFilenames.length === 0) {
      console.error('no PDF page was downloaded; nothing to merge');
    } else {
      mergePdfs(pdfFilenames, formatToday() + '_' + journalName + '.pdf');
    }
  } finally {
    for (const filename of tempFiles) {
      try {
        fs.unlinkSync(filename);
      } catch (err) {
        // A failed download may not have created its file at all.
        if (err.code !== 'ENOENT') {
          console.error(err);
        }
      }
    }
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

15
package.json Normal file
View File

@ -0,0 +1,15 @@
{
"name": "scrapnews",
"version": "1.0.0",
"description": "scrape PDFs of daily newspapers from Europresse",
"main": "main.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "p6e7p7",
"license": "ISC",
"dependencies": {
"needle": "^3.0.0",
"puppeteer": "^13.0.0"
}
}