init repository
This commit is contained in:
parent
0dcfdf5900
commit
43db451ee3
143
main.js
Normal file
143
main.js
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
const puppeteer = require('puppeteer');
|
||||||
|
const readline = require('readline');
|
||||||
|
const fs = require('fs');
|
||||||
|
const needle = require('needle');
|
||||||
|
const { execSync } = require('child_process');
|
||||||
|
|
||||||
|
/**
 * Prompt on the terminal and resolve with the line the user enters.
 *
 * @param {string} label - Text printed before the input; ': ' is appended.
 * @param {boolean} [muted=false] - When true, whatever readline would echo
 *   while the user types is replaced by '*' (for passwords). The prompt
 *   itself and line breaks are printed unmasked.
 * @returns {Promise<string>} The entered line.
 */
async function readLine(label, muted = false) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  // `_writeToOutput` is an undocumented readline hook; overriding it is how
  // this script masks echoed input.
  rl._writeToOutput = function _writeToOutput(str) {
    const isLineBreak = str === '\n' || str === '\r' || str === '\r\n';
    if (rl.stdoutMuted && !isLineBreak) {
      rl.output.write('*');
    } else {
      rl.output.write(str);
    }
  };

  return new Promise((resolve) => {
    // The prompt is written while question() runs, so masking is switched on
    // only afterwards; otherwise the prompt itself would be shown as '*'.
    rl.stdoutMuted = false;
    rl.question(`${label}: `, (answer) => {
      rl.close();
      resolve(answer);
    });
    rl.stdoutMuted = muted;
  });
}
|
||||||
|
|
||||||
|
/**
 * Format a date as `YYYY_MM_DD` using the local time zone.
 *
 * @param {Date} date - Date to format.
 * @returns {string} The zero-padded `YYYY_MM_DD` string.
 */
function formatDate(date) {
  const pad = (n) => String(n).padStart(2, '0');
  return `${date.getFullYear()}_${pad(date.getMonth() + 1)}_${pad(date.getDate())}`;
}

/**
 * Re-issue an intercepted browser request from Node and save the response
 * body to disk.
 *
 * @param {object} popup - Puppeteer page whose cookies are sent along.
 * @param {object} request - Intercepted Puppeteer HTTP request.
 * @param {string} filename - Path of the file to write.
 * @returns {Promise<string>} `filename`, once the file is fully written.
 * @throws {Error} When the request fails or the status is not 200.
 */
async function downloadPdf(popup, request, filename) {
  const cookies = await popup.cookies();
  // Copy the headers instead of mutating the request's own header object.
  const headers = {
    ...request.headers(),
    Cookie: cookies.map(({ name, value }) => `${name}=${value}`).join(';'),
  };
  const body = await new Promise((resolve, reject) => {
    needle.get(request.url(), { headers }, (err, response) => {
      if (err) {
        reject(err);
      } else if (response.statusCode !== 200) {
        reject(new Error(`unexpected HTTP status ${response.statusCode}`));
      } else {
        resolve(response.body);
      }
    });
  });
  await fs.promises.writeFile(filename, body);
  return filename;
}

/**
 * Script entry point.
 *
 * Drives a headless browser through the INSA Lyon library login (credentials
 * are read from the terminal), opens the Europresse PDF search form, lets the
 * user pick a newspaper, downloads every listed page as `<n>.pdf`, merges the
 * pages with Ghostscript into `<YYYY_MM_DD>_<newspaper id>.pdf` and finally
 * deletes the per-page files.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  let journalName;
  let pdfFilenames = [];

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 768 });
    await page.goto('https://bibliotheque.insa-lyon.fr/parcours-recherche/panorama_des_ressources');

    await Promise.all([
      // Go to "my account".
      page.click('#boite_13 > a'),
      page.waitForNavigation(),
    ]);
    await Promise.all([
      // Start signing in.
      page.click('#boite_23 > div.widget-body.card-body > div > button'),
      page.waitForNavigation(),
    ]);

    await page.click('#username');
    const username = await readLine('login');
    await page.keyboard.type(username);
    await page.click('#password');
    console.log('password:');
    const password = await readLine('', true);
    await page.keyboard.type(password);
    await Promise.all([
      // Submit the CAS login form.
      page.click('#fm1 > div.row.btn-row > input.btn-submit'),
      page.waitForNavigation(),
    ]);

    await page.goto('https://docelec.insa-lyon.fr/login?url=http://nouveau.europresse.com/access/ip/default.aspx?un=INSAT_3');
    await page.goto('https://nouveau-europresse-com.docelec.insa-lyon.fr/webpages/Pdf/SearchForm.aspx');

    // List the available newspapers as "<id> : <label>".
    const journals = await page.evaluate(() =>
      Array.from(document.querySelectorAll('#lbSources option'), (option) => ({
        id: option.value,
        label: option.innerText,
      })));
    for (const { id, label } of journals) {
      console.log(id + ' : ' + label);
    }

    journalName = (await readLine('newspaper id')).trim(); // e.g. "EC_P", "LF_P"
    // Fail early on a typo: an unknown id would select nothing in the form.
    if (!journals.some(({ id }) => id === journalName)) {
      throw new Error(`unknown newspaper id: ${journalName}`);
    }
    await page.select('select#lbSources', journalName);

    await Promise.all([
      // Opens the PDF popup.
      page.click('#btnSearch'),
      page.waitForNavigation(),
    ]);
    await page.waitForTimeout(2000);

    // The popup is taken to be the most recently opened page.
    const pages = await browser.pages();
    const pop = pages[pages.length - 1];
    await pop.setRequestInterception(true);

    const downloads = [];
    pop.on('request', (request) => {
      if (!request.url().includes('DocName=pdf')) {
        request.continue().catch((err) => console.error(err));
        return;
      }
      // The index is taken synchronously so that two requests arriving close
      // together can never be given the same file name.
      const filename = `${downloads.length}.pdf`;
      // As before, the intercepted request itself is neither continued nor
      // aborted; the PDF is fetched from Node instead.
      // NOTE(review): consider request.abort() if the pending request matters.
      downloads.push(
        downloadPdf(pop, request, filename).catch((err) => {
          console.error(`download of ${filename} failed: ${err.message}`);
          return null;
        })
      );
    });

    const frameList = pop.frames().find((frame) => frame.name() === 'ListDoc');
    if (!frameList) {
      throw new Error("frame 'ListDoc' not found in the PDF popup");
    }
    await frameList.waitForSelector('#listdoc');
    const pdfIds = await frameList.evaluate(() =>
      Array.from(document.querySelectorAll('#listdoc a'), (link) => '#' + link.id));

    for (const id of pdfIds) {
      console.log('page: ' + id);
      await Promise.all([
        frameList.click(id),
        frameList.waitForTimeout(1000),
      ]);
    }

    // Wait for every download to be on disk before the browser goes away and
    // before the (blocking) merge starts; keep only the pages that succeeded.
    pdfFilenames = (await Promise.all(downloads)).filter((name) => name !== null);
  } finally {
    await browser.close();
  }

  if (pdfFilenames.length === 0) {
    console.error('no PDF page was downloaded; nothing to merge');
    return;
  }

  // Keep the output file name free of path separators and other odd characters.
  const safeName = journalName.replace(/[^\w.-]/g, '_');
  const outputFile = `${formatDate(new Date())}_${safeName}.pdf`;

  // execFileSync takes an argument vector, so no shell ever parses the
  // user-supplied newspaper id.
  const { execFileSync } = require('child_process');
  try {
    execFileSync(
      'gs',
      [
        '-dBATCH',
        '-dNOPAUSE',
        '-q',
        '-sDEVICE=pdfwrite',
        '-dPDFSETTINGS=/prepress',
        `-sOutputFile=${outputFile}`,
        ...pdfFilenames,
      ],
      { stdio: 'inherit' }
    );
  } catch (error) {
    console.log(`error: ${error.message}`);
    process.exitCode = 1;
  } finally {
    // Remove the per-page files whether or not the merge succeeded.
    for (const filename of pdfFilenames) {
      try {
        fs.unlinkSync(filename);
      } catch (err) {
        console.error(err);
      }
    }
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
15
package.json
Normal file
15
package.json
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"name": "scrapnews",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "scrape PDFs of daily newspapers from Europresse",
|
||||||
|
"main": "main.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"author": "p6e7p7",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"needle": "^3.0.0",
|
||||||
|
"puppeteer": "^13.0.0"
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user