Check-in Progress on V2

Saving my progress on job_scraper_v2. This overhaul will separate cURL requests and HTML parsing into distinct stages. Buyer beware: it is far from complete.

The flow is: request all HTML documents first, close all network connections, parse the HTML, and finally save the results to disk. A rough sketch of how these stages could fit together follows the commit details below.
mharb 2023-10-27 14:16:49 +00:00
parent 582467957e
commit da835d4fb0
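To make the target shape concrete, here is a rough sketch of how the stages could eventually fit together. It reuses the names already present in the file below (HOST, PAGE, SEARXHOSTS, curl_search_jobs, scrape_shortdesc, scrape_links, dump_jobs_file); the raw_bodies list and the range loops are only an illustration of the plan, not code that exists in this commit.

# Sketch only: stage 1 does the network requests, stage 2 does the parsing, stage 3 writes to disk
import random
from bs4 import BeautifulSoup

raw_bodies = []
for _ in range(HOST):                        # stage 1: fetch every page up front
    host = random.choice(SEARXHOSTS)
    for page in range(1, PAGE + 1):
        raw_bodies.append(curl_search_jobs(page, host))
# the "close all network connections" step from the flow above would slot in here

descs, links = [], []
for body in raw_bodies:                      # stage 2: parse the saved bodies
    html = BeautifulSoup(body, 'html.parser').body
    descs.extend(scrape_shortdesc(html))
    links.extend(scrape_links(html))

dump_jobs_file(descs, links)                 # stage 3: zip and append to jobs.csv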


@@ -6,12 +6,18 @@ import pycurl
from bs4 import BeautifulSoup
import certifi
QUERY = 'remote'
# Specify your search keywords
QUERY = 'developer'
JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com) ' + QUERY
AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
HOST = 3
PAGE = 3
# Spoof your device
AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
# Number of hosts chosen at random
HOST = 1
# Number of pages requested per host
PAGE = 1
# Customize request headers if you wish
HEADERS = [
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language: en-US,en;q=0.9',
@@ -27,14 +33,15 @@ HEADERS = [
    'sec-fetch-site: same-origin',
    'sec-fetch-user: ?1',
    'upgrade-insecure-requests: 1',
    'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    AGENT
]
#Populate this list with your favourite searX hosts
# Populate this list with your favourite searX hosts
SEARXHOSTS = [
]
def curl_search_jobs(page, jobgrepurl):
# Request a number of pages from the chosen searX host
def curl_search_jobs(pages, chosenhost):
    request = pycurl.Curl()
    request.setopt(request.FOLLOWLOCATION, 1)
    request.setopt(request.HTTPHEADER, HEADERS)
@@ -51,10 +58,10 @@ def curl_search_jobs(page, jobgrepurl):
    POSTFIELDS = urlencode(post_data)
    request.setopt(request.POSTFIELDS, POSTFIELDS)
    request.setopt(request.URL, JOBGREPURL + '/search')
    request.setopt(request.URL, chosenhost + '/search')
    request.setopt(request.WRITEDATA, buffer)
    request.setopt(request.CAINFO, certifi.where())
    # request.setopt(request.VERBOSE, 1)
    request.setopt(request.VERBOSE, 0)
    request.perform()
    print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
@@ -63,44 +70,54 @@ def curl_search_jobs(page, jobgrepurl):
    body = buffer.getvalue()
    return body
# Parse BeautifulSoup objects for short descriptions
def scrape_shortdesc(html):
    for link in html.find_all('h3'):
        print(link.find('a').text)
# Parse BeautifulSoup objects for URLs
def scrape_links(html):
    for link in html.find_all('a', {"class": "url_wrapper"}):
        print(link.get('href'))
# Zip two lists together and dump to disk
def dump_jobs_file(shortdesc, links):
    with open('jobs.csv', 'a', encoding='utf8') as file:
        write = csv.writer(file)
        write.writerows(zip(shortdesc, links))
        file.close()
SHORTDESC = []
LINKS = []
HTMLOBJECT = []
def scrape_shortdesc(html):
    shortdesc = []
    for article in html.find_all('h3'):
        for highlight in article.find_all('a'):
            shortdesc.append(highlight.text)
    return shortdesc
def scrape_links(html):
    links = []
    for link in html.find_all('a', {"class": "url_wrapper"}):
        links.append(link['href'])
    return links
SCRAPEDDESC = []
SCRAPEDLINKS = []
# Choose one or more random hosts and request one or more searches from each
while HOST > 0:
    JOBGREPURL = random.choice(SEARXHOSTS)
    CHOSENHOST = random.choice(SEARXHOSTS)
    while PAGE > 0:
        CURLBODY = curl_search_jobs(PAGE, JOBGREPURL)
        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')
        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
        SCRAPEDLINKS = scrape_links(PARSEDHTML)
        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
        CURLBODY = curl_search_jobs(PAGE, CHOSENHOST)
        HTMLOBJECT = BeautifulSoup(CURLBODY, 'html.parser').body
        PAGE -= 1
    HOST -= 1
# print(body.decode('iso-8859-1'))
try:
    SHORTDESC = scrape_shortdesc(HTMLOBJECT)
    LINKS = scrape_links(HTMLOBJECT)
except Exception as e:
    print(e)
    #print("failed to parse or 429")
print(SHORTDESC)
print(LINKS)
#try:
# dump_jobs_file(SHORTDESC,LINKS)
#except Exception as e:
# print(e)
# print(body.decode('iso-8859-1'))