init repository
This commit is contained in:
parent
0dcfdf5900
commit
43db451ee3
143
main.js
Normal file
143
main.js
Normal file
@ -0,0 +1,143 @@
|
||||
const { execFileSync, execSync } = require('child_process');
const fs = require('fs');
const readline = require('readline');

const needle = require('needle');
const puppeteer = require('puppeteer');
|
||||
|
||||
/**
 * Prompt on process.stdout and resolve with the line read from process.stdin.
 *
 * @param {string} label - Text displayed before the input; `': '` is appended.
 * @param {boolean} [muted=false] - When true, typed characters are echoed as
 *   `*` instead of being displayed (used for the password prompt).
 * @returns {Promise<string>} The line entered by the user.
 */
async function readLine(label, muted = false) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  const prompt = `${label}: `;

  rl.stdoutMuted = muted;
  // NOTE(review): `_writeToOutput` is an undocumented readline hook; the
  // override is kept from the original code and is assumed to receive the
  // prompt, echoed keystrokes, line redraws and the final line break.
  rl._writeToOutput = function _writeToOutput(str) {
    if (!rl.stdoutMuted) {
      rl.output.write(str);
      return;
    }
    const text = String(str);
    const currentLine = typeof rl.line === 'string' ? rl.line : '';
    // A write of exactly "prompt + current line" is the prompt being shown or
    // redrawn: keep the prompt readable and mask only what follows it.
    const isPromptRedraw = text === prompt + currentLine;
    const typed = isPromptRedraw ? text.slice(prompt.length) : text;
    // One star per typed character; line breaks pass through so the cursor
    // still moves to the next line after Enter.
    const masked = typed.replace(/[^\r\n]/g, '*');
    rl.output.write((isPromptRedraw ? prompt : '') + masked);
  };

  return new Promise((resolve) => {
    rl.question(prompt, (answer) => {
      rl.close();
      resolve(answer);
    });
  });
}
|
||||
|
||||
/**
 * Format a date as `YYYY_MM_DD` using local time.
 *
 * @param {Date} date
 * @returns {string}
 */
function formatDateStamp(date) {
  const year = date.getFullYear();
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}_${month}_${day}`;
}

/**
 * Click `selector` on `page` and wait for the navigation it triggers.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} selector - CSS selector of the element to click.
 * @returns {Promise<Array>}
 */
function clickAndNavigate(page, selector) {
  return Promise.all([page.click(selector), page.waitForNavigation()]);
}

/**
 * Download `url` into `filename` with needle.
 *
 * Never rejects: failures are reported through the `error` property so a
 * failed page cannot raise an unhandled rejection while other pages are still
 * being requested.
 *
 * @param {string} url
 * @param {object} headers - HTTP headers sent with the download.
 * @param {string} filename - Destination file.
 * @returns {Promise<{filename: string, error?: Error}>}
 */
function downloadPdfPage(url, headers, filename) {
  return new Promise((resolve) => {
    needle.get(url, { headers, output: filename }, (error, response) => {
      if (error) {
        resolve({ filename, error });
        return;
      }
      // NOTE(review): needle is believed to write the `output` file only for
      // 200 responses; any other status is treated as a failed page.
      if (response.statusCode !== 200) {
        resolve({ filename, error: new Error(`HTTP ${response.statusCode} for ${url}`) });
        return;
      }
      resolve({ filename });
    });
  });
}

/**
 * Re-issue an intercepted PDF request outside the browser, with the popup's
 * cookies, and save the response to `filename`.
 *
 * @param {object} popup - Puppeteer page hosting the PDF viewer.
 * @param {object} request - Intercepted Puppeteer request.
 * @param {string} filename - Destination file.
 * @returns {Promise<{filename: string, error?: Error}>}
 */
async function savePdfRequest(popup, request, filename) {
  try {
    const cookies = await popup.cookies();
    // Copy the headers instead of mutating the object owned by the request.
    const headers = {
      ...request.headers(),
      Cookie: cookies.map((ck) => ck.name + '=' + ck.value).join(';'),
    };
    const url = request.url();
    // The browser does not need the PDF itself; release the intercepted
    // request instead of leaving it stalled.
    request.abort().catch((err) => console.error(err));
    return await downloadPdfPage(url, headers, filename);
  } catch (error) {
    return { filename, error };
  }
}

/**
 * Script entry point: log in through the INSA Lyon library portal, let the
 * user pick a newspaper on Europresse, download every PDF page of today's
 * issue and merge them into `<YYYY_MM_DD>_<newspaper id>.pdf` with Ghostscript.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const downloads = [];
  let journalName;

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 768 });
    await page.goto("https://bibliotheque.insa-lyon.fr/parcours-recherche/panorama_des_ressources");

    // go to "my account"
    await clickAndNavigate(page, '#boite_13 > a');
    // choose "sign in"
    await clickAndNavigate(page, '#boite_23 > div.widget-body.card-body > div > button');

    await page.click('#username');
    const username = await readLine('login');
    await page.keyboard.type(username);

    await page.click('#password');
    console.log('password:');
    const password = await readLine('', true);
    await page.keyboard.type(password);

    // submit the CAS login form
    await clickAndNavigate(page, '#fm1 > div.row.btn-row > input.btn-submit');

    await page.goto("https://docelec.insa-lyon.fr/login?url=http://nouveau.europresse.com/access/ip/default.aspx?un=INSAT_3");
    await page.goto("https://nouveau-europresse-com.docelec.insa-lyon.fr/webpages/Pdf/SearchForm.aspx");

    const sources = await page.evaluate(() =>
      Array.from(document.querySelectorAll("#lbSources option"), (option) => ({
        id: option.value,
        label: option.innerText,
      })));
    for (const source of sources) console.log(source.id + ' : ' + source.label);

    journalName = (await readLine('newspaper id')).trim(); // "EC_P", "LF_P"
    // The id ends up in the output file name, so only accept a listed one.
    if (!sources.some((source) => source.id === journalName)) {
      throw new Error(`unknown newspaper id: ${journalName}`);
    }
    await page.select("select#lbSources", journalName);

    // the search opens the PDF viewer popup
    await clickAndNavigate(page, '#btnSearch');
    await page.waitForTimeout(2000);
    const pages = await browser.pages();
    const pop = pages[pages.length - 1];

    await pop.setRequestInterception(true);
    let pageCount = 0;
    pop.on('request', (request) => {
      if (!request.url().includes('DocName=pdf')) {
        request.continue().catch((err) => console.error(err));
        return;
      }
      // Reserve the file index synchronously so that two concurrent requests
      // can never be written to the same file.
      const filename = pageCount + '.pdf';
      pageCount += 1;
      downloads.push(savePdfRequest(pop, request, filename));
    });

    const frameList = pop.frames().find((frame) => frame.name() === 'ListDoc');
    if (!frameList) {
      throw new Error('frame "ListDoc" not found in the PDF viewer popup');
    }
    await frameList.waitForSelector('#listdoc');
    const pdfIds = await frameList.evaluate(() =>
      Array.from(document.querySelectorAll("#listdoc a"), (link) => '#' + link.id));

    for (const id of pdfIds) {
      console.log('page: ' + id);
      // Each click makes the viewer request one PDF page; the fixed delay
      // gives that request time to be issued before the next click.
      await Promise.all([
        frameList.click(id),
        frameList.waitForTimeout(1000),
      ]);
    }
  } finally {
    // Always release the browser, including when a step above throws.
    await browser.close();
  }

  // Wait for every download to finish before handing the files to Ghostscript.
  const results = await Promise.all(downloads);
  const pdfFilenames = [];
  for (const result of results) {
    if (result.error) {
      console.error(`download of ${result.filename} failed: ${result.error.message}`);
    } else {
      pdfFilenames.push(result.filename);
    }
  }
  if (pdfFilenames.length === 0) {
    throw new Error('no PDF page was downloaded');
  }

  const outputName = formatDateStamp(new Date()) + '_' + journalName + '.pdf';
  // Arguments are passed as an array, without a shell, so the user-supplied
  // newspaper id cannot be interpreted as shell syntax. If Ghostscript fails
  // this throws and the per-page files are kept.
  execFileSync('gs', [
    '-dBATCH',
    '-dNOPAUSE',
    '-q',
    '-sDEVICE=pdfwrite',
    '-dPDFSETTINGS=/prepress',
    '-sOutputFile=' + outputName,
    ...pdfFilenames,
  ], { stdio: 'inherit' });

  // Remove the per-page files, including any left behind by a failed download.
  for (const result of results) {
    try {
      fs.unlinkSync(result.filename);
    } catch (err) {
      if (err.code !== 'ENOENT') console.error(err);
    }
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
15
package.json
Normal file
15
package.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "scrapnews",
|
||||
"version": "1.0.0",
|
||||
"description": "scrap pdf of daily journals from europress",
|
||||
"main": "main.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"author": "p6e7p7",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"needle": "^3.0.0",
|
||||
"puppeteer": "^13.0.0"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user