Check-in Progress on V2
Saving my progress on job_scraper_v2. This overhaul separates the cURL requests and the HTML parsing into distinct stages. Buyer beware: it is far from complete. The flow is: request all HTML documents first, close every network connection, then parse the HTML, and finally save the results to disk (a rough sketch of this flow follows below).
parent 582467957e
commit da835d4fb0
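A minimal sketch of the staged flow described in the commit message, assuming pycurl, BeautifulSoup, and certifi as in the script. fetch_page(), parse_jobs(), and run_pipeline() are hypothetical placeholder names, not functions from this repository, and the real curl_search_jobs() also sets the request headers and POST fields.

# Sketch only: fetch every page first, close connections, then parse, then save.
import csv
from io import BytesIO

import certifi
import pycurl
from bs4 import BeautifulSoup


def fetch_page(host, page):
    # Stage 1: request one results page and keep only the raw HTML body.
    # (The page number would go into the POST fields in the real script.)
    buffer = BytesIO()
    request = pycurl.Curl()
    request.setopt(request.FOLLOWLOCATION, 1)
    request.setopt(request.URL, host + '/search')
    request.setopt(request.WRITEDATA, buffer)
    request.setopt(request.CAINFO, certifi.where())
    request.perform()
    request.close()  # Stage 2: the connection is closed before any parsing happens
    return buffer.getvalue()


def parse_jobs(body):
    # Stage 3: parse short descriptions and URLs out of the stored HTML.
    html = BeautifulSoup(body, 'html.parser')
    descs = [a.text for h3 in html.find_all('h3') for a in h3.find_all('a')]
    links = [a['href'] for a in html.find_all('a', {"class": "url_wrapper"})]
    return list(zip(descs, links))


def run_pipeline(hosts, pages_per_host):
    bodies = [fetch_page(h, p) for h in hosts for p in range(1, pages_per_host + 1)]
    rows = [row for body in bodies for row in parse_jobs(body)]
    # Stage 4: dump everything to disk in one pass.
    with open('jobs.csv', 'a', encoding='utf8', newline='') as file:
        csv.writer(file).writerows(rows)

Keeping the fetch loop and the parse loop separate means no parsing happens while sockets are still live, which is the ordering the commit message describes.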
@@ -6,12 +6,18 @@ import pycurl
from bs4 import BeautifulSoup
import certifi

QUERY = 'remote'
# Specify your search keywords
QUERY = 'developer'
JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com)' + QUERY
AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
HOST = 3
PAGE = 3
# Spoof your device
AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'

# Number of hosts chosen at random
HOST = 1
# Number of pages requested per host
PAGE = 1

# Customize request headers if you wish
HEADERS = [
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language: en-US,en;q=0.9',
@@ -27,14 +33,15 @@ HEADERS = [
    'sec-fetch-site: same-origin',
    'sec-fetch-user: ?1',
    'upgrade-insecure-requests: 1',
    'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    'user-agent: ' + AGENT
]

#Populate this list with your favourite searX hosts
# Populate this list with your favourite searX hosts
SEARXHOSTS = [
]

def curl_search_jobs(page, jobgrepurl):
# Request a number of pages from the chosen searX host
def curl_search_jobs(pages, chosenhost):
    request = pycurl.Curl()
    request.setopt(request.FOLLOWLOCATION, 1)
    request.setopt(request.HTTPHEADER, HEADERS)
@@ -51,10 +58,10 @@ def curl_search_jobs(page, jobgrepurl):
    POSTFIELDS = urlencode(post_data)
    request.setopt(request.POSTFIELDS, POSTFIELDS)

    request.setopt(request.URL, JOBGREPURL + '/search')
    request.setopt(request.URL, CHOSENHOST + '/search')
    request.setopt(request.WRITEDATA, buffer)
    request.setopt(request.CAINFO, certifi.where())
    # request.setopt(request.VERBOSE, 1)
    request.setopt(request.VERBOSE, 0)

    request.perform()
    print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
@@ -63,44 +70,54 @@ def curl_search_jobs(page, jobgrepurl):
    body = buffer.getvalue()
    return body

# Parse BeautifulSoup objects for short descriptions
def scrape_shortdesc(html):
    for link in html.find_all('h3'):
        print(link.find('a').text)

# Parse BeautifulSoup objects for URLs
def scrape_links(html):
    for link in html.find_all('a', {"class": "url_wrapper"}):
        print(link.get('href'))

# Zip two lists together and dump to disk
def dump_jobs_file(shortdesc, links):
    with open('jobs.csv', 'a', encoding='utf8') as file:
        write = csv.writer(file)
        write.writerows(zip(shortdesc, links))
        file.close()

SHORTDESC = []
LINKS = []
HTMLOBJECT = []

def scrape_shortdesc(html):
    shortdesc = []
    for article in html.find_all('h3'):
        for highlight in article.find_all('a'):
            shortdesc.append(highlight.text)
    return shortdesc


def scrape_links(html):
    links = []
    for link in html.find_all('a', {"class": "url_wrapper"}):
        links.append(link['href'])
    return links

SCRAPEDDESC = []
SCRAPEDLINKS = []

# Choose one or more random hosts and request one or more searches from each
while HOST > 0:
    JOBGREPURL = random.choice(SEARXHOSTS)
    CHOSENHOST = random.choice(SEARXHOSTS)

    while PAGE > 0:

        CURLBODY = curl_search_jobs(PAGE, JOBGREPURL)
        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')

        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
        SCRAPEDLINKS = scrape_links(PARSEDHTML)
        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
        CURLBODY = curl_search_jobs(PAGE, CHOSENHOST)
        HTMLOBJECT = BeautifulSoup(CURLBODY, 'html.parser').body

        PAGE -= 1
    HOST -= 1

# print(body.decode('iso-8859-1'))
try:
    SHORTDESC = scrape_shortdesc(HTMLOBJECT)
    LINKS = scrape_links(HTMLOBJECT)
except Exception as e:
    print(e)
    #print("failed to parse or 429")

print(SHORTDESC)
print(LINKS)

#try:
# dump_jobs_file(SHORTDESC,LINKS)
#except Exception as e:
# print(e)



# print(body.decode('iso-8859-1'))
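For reference, a standalone check of the new parsing stage: the two scrape_* functions exactly as introduced above, run against a small made-up HTML snippet (the sample markup and URL are illustrative, not a real searX response).

# Standalone sketch of the new parsing stage only.
from bs4 import BeautifulSoup


def scrape_shortdesc(html):
    # Collect the link text of every anchor inside an <h3> result heading
    shortdesc = []
    for article in html.find_all('h3'):
        for highlight in article.find_all('a'):
            shortdesc.append(highlight.text)
    return shortdesc


def scrape_links(html):
    # Collect the href of every anchor carrying the url_wrapper class
    links = []
    for link in html.find_all('a', {"class": "url_wrapper"}):
        links.append(link['href'])
    return links


SAMPLE = """
<body>
  <h3><a href="#">Remote Python Developer</a></h3>
  <a class="url_wrapper" href="https://jobs.example.com/123">link</a>
</body>
"""

HTMLOBJECT = BeautifulSoup(SAMPLE, 'html.parser').body
print(scrape_shortdesc(HTMLOBJECT))  # ['Remote Python Developer']
print(scrape_links(HTMLOBJECT))      # ['https://jobs.example.com/123']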