From 43db451ee3e373784a91346db10d1b2b02a61766 Mon Sep 17 00:00:00 2001
From: Pierre-Edouard Portier
Date: Tue, 21 Dec 2021 14:13:20 +0100
Subject: [PATCH] init repository

---
 main.js | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++
 package.json | 15 ++++++
 2 files changed, 158 insertions(+)
 create mode 100644 main.js
 create mode 100644 package.json

diff --git a/main.js b/main.js
new file mode 100644
index 0000000..4d83af2
--- /dev/null
+++ b/main.js
@@ -0,0 +1,143 @@
+const puppeteer = require('puppeteer');
+const readline = require('readline');
+const fs = require('fs');
+const needle = require('needle');
+const { execSync } = require('child_process');
+
+/**
+ * Ask a question on the terminal and resolve with the typed answer.
+ *
+ * A fresh readline interface bound to process.stdin / process.stdout is
+ * created for every call and closed as soon as the answer arrives.
+ *
+ * @param {string} label - Prompt text; ': ' is appended before it is shown.
+ * @param {boolean} [muted=false] - When true, each write that goes through
+ *   the overridden `_writeToOutput` emits "*" instead of the real text.
+ * @returns {Promise<string>} Resolves with the answer passed to the
+ *   `rl.question` callback. The promise is never rejected by this code.
+ */
+async function readLine(label, muted=false) {
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout
+  });
+  rl.stdoutMuted = muted;
+  // NOTE(review): `_writeToOutput` is an underscore-prefixed member, so it
+  // looks like a non-public readline hook - confirm it is stable across the
+  // Node versions this script targets. Presumably the prompt itself is also
+  // written through it, which would explain why the caller prints
+  // 'password:' separately and passes an empty label - TODO confirm.
+  rl._writeToOutput = function _writeToOutput(str) {
+    if (rl.stdoutMuted)
+      rl.output.write("*");
+    else
+      rl.output.write(str);
+  };
+  return new Promise(resolve => {
+    rl.question(label + ': ', (answer) => {
+      rl.close();
+      resolve(answer);
+    });
+  });
+}
+
+/*
+ * Main script: drives a headless browser through the INSA Lyon library
+ * login, opens the Europresse PDF search form, lets the user pick a source
+ * from the `#lbSources` list, then clicks every link found under `#listdoc`
+ * while a request handler hands matching PDF requests to `needle`.
+ */
+(async () => {
+  const browser = await puppeteer.launch({headless: true});
+  const page = await browser.newPage();
+  // NOTE(review): unlike the surrounding page calls, this one is not
+  // awaited - confirm against the Puppeteer docs whether it returns a promise.
+  page.setViewport({width: 1366, height: 768});
+  await page.goto("https://bibliotheque.insa-lyon.fr/parcours-recherche/panorama_des_ressources");
+  // Each click that triggers a page load is paired with waitForNavigation()
+  // inside one Promise.all, so the wait is registered before the click resolves.
+  await Promise.all([
+    // go to "my account"
+    page.click('#boite_13 > a'),
+    page.waitForNavigation(),
+  ]);
+  await Promise.all([
+    // log in
+    page.click('#boite_23 > div.widget-body.card-body > div > button'),
+    page.waitForNavigation(),
+  ]);
+  // Credentials are read from the terminal and typed into the focused field.
+  await page.click('#username');
+  const username = await readLine('login');
+  await page.keyboard.type(username);
+  await page.click('#password');
+  // The label is printed here because readLine is called with an empty
+  // label and muted=true for the password.
+  console.log('password:');
+  const password = await readLine('', true);
+  await page.keyboard.type(password);
+  await Promise.all([
+    // submit the CAS login form
+    page.click('#fm1 > div.row.btn-row > input.btn-submit'),
+    page.waitForNavigation(),
+  ]);
+  await page.goto("https://docelec.insa-lyon.fr/login?url=http://nouveau.europresse.com/access/ip/default.aspx?un=INSAT_3");
+  await page.goto("https://nouveau-europresse-com.docelec.insa-lyon.fr/webpages/Pdf/SearchForm.aspx");
+  // Runs in the page: collect "value : visible text" for every option of
+  // the #lbSources select so the user can see the available ids.
+  const journalNames = await page.evaluate(() => {
+    let journalNames = [];
+    document.querySelectorAll("#lbSources option").forEach((item) => {
+      journalNames.push(item.value+' : '+item.innerText);
+    });
+    return journalNames;
+  });
+  for (let journalName of journalNames) console.log(journalName);
+  // The answer is passed unchanged to page.select as the option value.
+  const journalName = await readLine('newspaper id'); // "EC_P", "LF_P"
+  await page.select("select#lbSources", journalName);
+  await Promise.all([
+    // opens the PDF popup
+    page.click('#btnSearch'),
+    page.waitForNavigation(),
+  ]);
+  // Fixed 2 s pause, then the last entry of browser.pages() is taken as the
+  // popup. NOTE(review): presumably the popup is always the newest page - confirm.
+  await page.waitForTimeout(2000);
+  const pages = await browser.pages();
+  const pop = pages[pages.length - 1];
+  await pop.setRequestInterception(true);
+  // Request handler with a private counter: the n-th matching request is
+  // handed to needle with `output` set to "<n>.pdf", starting at 0.
+  // NOTE(review): `countRequest` is assigned without const/let/var.
+  countRequest = (() => {
+    let count = 0;
+    return async request => {
+      // `> 0` means a match at index 0 would not count as a PDF request.
+      if(request.url().indexOf('DocName=pdf')>0) {
+        // NOTE(review): `_headers` and `_url` are underscore-prefixed
+        // properties, whereas the public `request.url()` is used above -
+        // confirm they exist in the pinned Puppeteer version.
+        const options = { headers: request._headers, output: count+'.pdf' };
+        // Copy the popup's cookies into a Cookie header for the needle call.
+        const cookies = await pop.cookies();
+        options.headers.Cookie = cookies.map(ck =>
+          ck.name+'='+ck.value).join(';');
+        // NOTE(review): the result of needle.get is not awaited and no
+        // callback is passed, so nothing here waits for the download to
+        // finish. This branch also never calls request.continue() or
+        // request.abort() on the intercepted request.
+        needle.get(request._url, options);
+        count++;
+      } else {
+        request.continue();
+      }
+    };
+  })();
+  pop.on('request', countRequest);
+  // The list of documents lives in the frame named 'ListDoc'.
+  const frameList = pop.frames().find((frame) => frame.name() === 'ListDoc');
+  await frameList.waitForSelector('#listdoc');
+  // Runs in the frame: build a '#<id>' selector for every link in #listdoc.
+  // NOTE(review): the inner `pdfIds` is assigned without a declaration.
+  const pdfIds = await frameList.evaluate(() => {
+    pdfIds = [];
+    document.querySelectorAll("#listdoc a").forEach((item) => {
+      pdfIds.push('#'+item.id);
+    });
+    return pdfIds;
+  });
+  //console.log(JSON.stringify(pdfIds, null, 4));
+  // Click each link in turn, waiting for both the click and a 1 s timer
+  // before moving on; nbPages ends up equal to the number of links clicked.
+  let nbPages = 0;
+  for (let id of pdfIds) {
+    console.log('page: ' + id);
+    await Promise.all([
+      frameList.click(id),
+      frameList.waitForTimeout(1000),
+    ]);
+    nbPages++;
+  }
+  await browser.close();
+
+  let pdfFilenames = [];
+  // NOTE(review): the text between "i" and "{" on the next line appears to
+  // have been lost when this patch was extracted (everything from a "<" to
+  // the following ">" is missing). What remains is the tail of a callback
+  // that logs `error.message`, `stderr` and `stdout`. The code that filled
+  // `pdfFilenames` and started the external command is not visible, and no
+  // call to the imported `execSync` survives - recover the original before
+  // relying on this section.
+  for (let i=0; i {
+    if (error) {
+      console.log(`error: ${error.message}`);
+    }
+    if (stderr) {
+      console.log(`stderr: ${stderr}`);
+    }
+    console.log(`stdout: ${stdout}`);
+  });
+
+  // Delete every file listed in pdfFilenames; a failed unlink is logged
+  // and does not stop the loop.
+  for (let filename of pdfFilenames) {
+    try {
+      fs.unlinkSync(filename);
+    } catch (err) {
+      console.error(err);
+    }
+  }
+})();
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..6b6e787
--- /dev/null
+++ b/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "scrapnews",
+  "version": "1.0.0",
+  "description": "scrap pdf of daily journals from europress",
+  "main": "main.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "p6e7p7",
+  "license": "ISC",
+  "dependencies": {
+    "needle": "^3.0.0",
+    "puppeteer": "^13.0.0"
+  }
+}