From da835d4fb03d369b9781203311f6de14e5c04a2c Mon Sep 17 00:00:00 2001
From: mharb
Date: Fri, 27 Oct 2023 14:16:49 +0000
Subject: [PATCH] Check-in Progress on V2

Saving my progress on job_scraper_v2. This overhaul will separate the
cURL requests and the HTML parsing into distinct stages. Buyer beware:
it is far from complete.

The flow: request all HTML documents first, close all network
connections, then parse the HTML, and finally save the results to disk.
---
 job_scraper.py | 85 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 34 deletions(-)

diff --git a/job_scraper.py b/job_scraper.py
index 1b2ac3a..b4850a4 100644
--- a/job_scraper.py
+++ b/job_scraper.py
@@ -6,12 +6,18 @@ import pycurl
 from bs4 import BeautifulSoup
 import certifi
 
-QUERY = 'remote'
+# Specify your search keywords
+QUERY = 'developer'
 JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com)' + QUERY
 
-AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
-HOST = 3
-PAGE = 3
+# Spoof your device
+AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
+# Number of hosts chosen at random
+HOST = 1
+# Number of pages requested per host
+PAGE = 1
+
+# Customize request headers if you wish
 HEADERS = [
     'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'accept-language: en-US,en;q=0.9',
@@ -27,14 +33,15 @@ HEADERS = [
     'sec-fetch-site: same-origin',
     'sec-fetch-user: ?1',
     'upgrade-insecure-requests: 1',
-    'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
+    AGENT
 ]
 
-#Populate this list with your favourite searX hosts
+# Populate this list with your favourite searX hosts
 SEARXHOSTS = [
 ]
 
-def curl_search_jobs(page, jobgrepurl):
+# Request a number of pages from the chosen searX host
+def curl_search_jobs(pages, chosenhost):
     request = pycurl.Curl()
     request.setopt(request.FOLLOWLOCATION, 1)
     request.setopt(request.HTTPHEADER, HEADERS)
@@ -51,10 +58,10 @@ def curl_search_jobs(page, jobgrepurl):
     POSTFIELDS = urlencode(post_data)
     request.setopt(request.POSTFIELDS, POSTFIELDS)
 
-    request.setopt(request.URL, JOBGREPURL + '/search')
+    request.setopt(request.URL, CHOSENHOST + '/search')
     request.setopt(request.WRITEDATA, buffer)
     request.setopt(request.CAINFO, certifi.where())
-    # request.setopt(request.VERBOSE, 1)
+    request.setopt(request.VERBOSE, 0)
 
     request.perform()
     print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
@@ -63,44 +70,54 @@
     body = buffer.getvalue()
     return body
 
 
+# Parse BeautifulSoup objects for short descriptions
+def scrape_shortdesc(html):
+    shortdesc = [link.find('a').text for link in html.find_all('h3')]
+    return shortdesc
+# Parse BeautifulSoup objects for URLs
+def scrape_links(html):
+    links = [link.get('href') for link in html.find_all('a', {"class": "url_wrapper"})]
+    return links
+
+# Zip two lists together and dump to disk
 def dump_jobs_file(shortdesc, links):
     with open('jobs.csv', 'a', encoding='utf8') as file:
         write = csv.writer(file)
         write.writerows(zip(shortdesc, links))
 
     file.close()
 
+SHORTDESC = []
+LINKS = []
+HTMLOBJECT = []
 
-def scrape_shortdesc(html):
-    shortdesc = []
-    for article in html.find_all('h3'):
-        for highlight in article.find_all('a'):
-            shortdesc.append(highlight.text)
-    return shortdesc
-
-
-def scrape_links(html):
-    links = []
-    for link in html.find_all('a', {"class": "url_wrapper"}):
-        links.append(link['href'])
-    return links
-
-SCRAPEDDESC = []
-SCRAPEDLINKS = []
-
 
+# Choose one or more random hosts and request one or more searches from each
 while HOST > 0:
-    JOBGREPURL = random.choice(SEARXHOSTS)
+    CHOSENHOST = random.choice(SEARXHOSTS)
     while PAGE > 0:
-        CURLBODY = curl_search_jobs(PAGE, JOBGREPURL)
-        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')
-
-        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
-        SCRAPEDLINKS = scrape_links(PARSEDHTML)
-        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
+        CURLBODY = curl_search_jobs(PAGE, CHOSENHOST)
+        HTMLOBJECT = BeautifulSoup(CURLBODY, 'html.parser').body
         PAGE -= 1
     HOST -= 1
 
 
-# print(body.decode('iso-8859-1'))
+try:
+    SHORTDESC = scrape_shortdesc(HTMLOBJECT)
+    LINKS = scrape_links(HTMLOBJECT)
+except Exception as e:
+    print(e)
+    #print("failed to parse or 429")
+
+print(SHORTDESC)
+print(LINKS)
+
+#try:
+#    dump_jobs_file(SHORTDESC,LINKS)
+#except Exception as e:
+#    print(e)
+
+
+
+# print(body.decode('iso-8859-1'))
\ No newline at end of file
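
Where this is headed, roughly: pull every page down first, let the curl handles go, then parse and scrape offline, and only write jobs.csv at the end. The sketch below is just that, a sketch; it leans on the helpers in this patch (curl_search_jobs, scrape_shortdesc, scrape_links, dump_jobs_file), assumes curl_search_jobs cleans up its own handle after perform(), and uses a couple of names (RAWPAGES, pageno) that do not exist in the script yet.

# Sketch of the intended staged flow; not part of this patch.
import random
from bs4 import BeautifulSoup   # already imported at the top of job_scraper.py

RAWPAGES = []   # raw response bytes, one entry per requested page (new name)

# Stage 1: request every page from each randomly chosen host, keep only the bytes
for _ in range(HOST):
    chosenhost = random.choice(SEARXHOSTS)
    for pageno in range(1, PAGE + 1):
        RAWPAGES.append(curl_search_jobs(pageno, chosenhost))

# Stage 2: all network work is done, parse the saved bodies offline
HTMLOBJECT = [BeautifulSoup(raw, 'html.parser').body for raw in RAWPAGES]

# Stage 3: scrape every parsed page, then dump everything to disk in one go
SHORTDESC = []
LINKS = []
for soup in HTMLOBJECT:
    if soup is None:        # a blocked or empty response has no <body>
        continue
    SHORTDESC += scrape_shortdesc(soup)
    LINKS += scrape_links(soup)

dump_jobs_file(SHORTDESC, LINKS)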
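
The two scrape helpers are also easy to sanity-check without touching the network. The snippet below feeds them a hand-written fragment shaped like a searX result (an approximation of the markup, not a captured page) so you can see exactly what the h3 > a and a.url_wrapper selectors pull out; run it in a REPL with the two helpers in scope.

from bs4 import BeautifulSoup

# Hand-written stand-in for a single searX result; the real markup may differ.
SAMPLE = """
<article class="result">
  <a class="url_wrapper" href="https://jobs.lever.co/example/123">jobs.lever.co</a>
  <h3><a href="https://jobs.lever.co/example/123">Backend Developer (Remote)</a></h3>
  <p class="content">Example posting text.</p>
</article>
"""

soup = BeautifulSoup(SAMPLE, 'html.parser')
print(scrape_shortdesc(soup))   # ['Backend Developer (Remote)']
print(scrape_links(soup))       # ['https://jobs.lever.co/example/123']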