# Provenance: commit 94a3e6eebdb61caec363eadebbaa04384bde5880
# From: mharb, Mon, 31 Jul 2023 14:06:12 +0000
# Subject: [PATCH] Upload script prototype. (creates job_scraper.py)
"""Prototype job scraper: imports and request configuration.

The constants below define the search query, the number of host picks and
result pages to fetch, and the HTTP headers sent with every search request.
"""

import csv
import random
from io import BytesIO
from urllib.parse import urlencode

import certifi
import pycurl
from bs4 import BeautifulSoup

# Free-text term appended to the site filter.
QUERY = 'remote'

# Full search expression: restrict results to the listed job-board domains,
# then add the free-text term. The separating space is required; without it
# the term is glued onto the closing parenthesis (")remote").
JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com)' + ' ' + QUERY

# Bare User-Agent *value*. It is handed to curl's USERAGENT option, which
# supplies the "User-Agent:" field name itself, so the field name must not be
# part of this string.
AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'

# Number of times a search host is picked at random.
HOST = 3
# Number of result pages requested per host pick (sent as 'pageno').
PAGE = 3

# Raw "name: value" header lines passed to curl's HTTPHEADER option.
# NOTE(review): the cookie looks like a set of searX preference values
# (it names searx.plugins.limiter) -- confirm against the target instances.
HEADERS = [
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language: en-US,en;q=0.9',
    'cache-control: max-age=0',
    'content-type: application/x-www-form-urlencoded',
    'cookie: categories=general; language=en-US; locale=en; autocomplete=google; image_proxy=1; method=POST; safesearch=0; theme=simple; results_on_new_tab=0; doi_resolver=oadoi.org; simple_style=auto; center_alignment=0; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=; disabled_plugins=; enabled_plugins=searx.plugins.limiter; tokens=; maintab=on; enginetab=on',
    'origin: null',
    'sec-ch-ua: "Chromium";v="113", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile: ?0',
    'sec-ch-ua-platform: "Windows"',
    'sec-fetch-dest: document',
    'sec-fetch-mode: navigate',
    'sec-fetch-site: same-origin',
    'sec-fetch-user: ?1',
    'upgrade-insecure-requests: 1',
    # Derived from AGENT so the two user-agent strings cannot drift apart.
    'user-agent: ' + AGENT,
]
# Populate this list with your favourite searX hosts (base URLs, e.g.
# 'https://searx.example.org'); '/search' is appended when querying.
SEARXHOSTS = [
]


def curl_search_jobs(page, jobgrepurl):
    """POST the job query to one search host and return the raw response.

    Args:
        page: Result page number, sent as the 'pageno' form field.
        jobgrepurl: Base URL of the search host; '/search' is appended.

    Returns:
        The response body as bytes (undecoded).

    Raises:
        pycurl.error: If the transfer fails.
    """
    request = pycurl.Curl()
    buffer = BytesIO()
    try:
        request.setopt(request.FOLLOWLOCATION, 1)
        request.setopt(request.HTTPHEADER, HEADERS)
        request.setopt(request.POST, 1)
        request.setopt(request.USERAGENT, AGENT)

        # Use the arguments, not module globals, so the caller decides
        # which page and which host are queried.
        form_fields = urlencode({
            'q': JOBQUERY,
            'pageno': page,
        })
        request.setopt(request.POSTFIELDS, form_fields)

        # Tolerate a trailing slash on the configured host URL.
        request.setopt(request.URL, jobgrepurl.rstrip('/') + '/search')
        request.setopt(request.WRITEDATA, buffer)
        # Verify TLS certificates against certifi's CA bundle.
        request.setopt(request.CAINFO, certifi.where())
        # Debugging aid: request.setopt(request.VERBOSE, 1)

        request.perform()
        print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
    finally:
        # Release the curl handle even when the transfer raises.
        request.close()

    return buffer.getvalue()


def dump_jobs_file(shortdesc, links):
    """Append (description, link) rows to 'jobs.csv' in the working directory.

    The two sequences are paired positionally; if their lengths differ, the
    surplus items of the longer one are not written.

    Args:
        shortdesc: Iterable of result titles.
        links: Iterable of result URLs.
    """
    # newline='' lets the csv module control line endings, as its
    # documentation requires; otherwise rows can gain blank lines on
    # platforms that translate '\n'.
    with open('jobs.csv', 'a', encoding='utf8', newline='') as file:
        csv.writer(file).writerows(zip(shortdesc, links))


def scrape_shortdesc(html):
    """Return the text of every <a> found inside an <h3> of the parsed page.

    Args:
        html: Parsed document exposing BeautifulSoup's ``find_all``.

    Returns:
        A list of strings in document order.
    """
    return [
        anchor.text
        for heading in html.find_all('h3')
        for anchor in heading.find_all('a')
    ]


def scrape_links(html):
    """Return the href of every <a class="url_wrapper"> in the parsed page.

    Args:
        html: Parsed document exposing BeautifulSoup's ``find_all``.

    Returns:
        A list of URL strings in document order.

    Raises:
        KeyError: If a matching anchor has no href attribute.
    """
    return [anchor['href'] for anchor in html.find_all('a', {"class": "url_wrapper"})]


def main():
    """Scrape PAGE result pages from HOST randomly chosen hosts into jobs.csv.

    A host is drawn from SEARXHOSTS for each of the HOST rounds (the same
    host may be drawn more than once). Pages are fetched from PAGE down to 1
    in every round.
    """
    if not SEARXHOSTS:
        raise SystemExit('SEARXHOSTS is empty: add at least one searX host URL.')

    for _ in range(HOST):
        host_url = random.choice(SEARXHOSTS)

        # A fresh range per host: a shared countdown would hit zero after
        # the first host and leave every later host with nothing to fetch.
        for page in range(PAGE, 0, -1):
            body = curl_search_jobs(page, host_url)
            parsed = BeautifulSoup(body, 'html.parser')
            dump_jobs_file(scrape_shortdesc(parsed), scrape_links(parsed))


if __name__ == '__main__':
    main()