Saving my progress on job_scraper_v2. This overhaul will separate the cURL requests and the HTML parsing into distinct stages. Buyer beware: it is far from complete. The flow: request the HTML documents first, close all network connections, then parse the HTML, and finally save the results to disk.
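A rough sketch of that staged flow, reusing the helpers defined in the full script below (with the same HOST/PAGE placeholders), might look like this:

# Sketch only: the intended two-pass flow, not what the script below does yet
bodies = []
for _ in range(HOST):
    chosenhost = random.choice(SEARXHOSTS)
    for pageno in range(1, PAGE + 1):
        # Stage 1: fetch each results page; curl_search_jobs() closes its own connection
        bodies.append(curl_search_jobs(pageno, chosenhost))

shortdesc, links = [], []
for body in bodies:
    # Stage 2: parse offline, with no network connections held open
    html = BeautifulSoup(body, 'html.parser').body
    shortdesc.extend(scrape_shortdesc(html))
    links.extend(scrape_links(html))

# Stage 3: save the zipped results to disk
dump_jobs_file(shortdesc, links)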

import csv
import random
from urllib.parse import urlencode
from io import BytesIO

import pycurl
from bs4 import BeautifulSoup
import certifi


# Specify your search keywords
QUERY = 'developer'
JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com) ' + QUERY

# Spoof your device
AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'

# Number of hosts chosen at random
HOST = 1

# Number of pages requested per host
PAGE = 1

# Customize request headers if you wish
HEADERS = [
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language: en-US,en;q=0.9',
    'cache-control: max-age=0',
    'content-type: application/x-www-form-urlencoded',
    'cookie: categories=general; language=en-US; locale=en; autocomplete=google; image_proxy=1; method=POST; safesearch=0; theme=simple; results_on_new_tab=0; doi_resolver=oadoi.org; simple_style=auto; center_alignment=0; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=; disabled_plugins=; enabled_plugins=searx.plugins.limiter; tokens=; maintab=on; enginetab=on',
    'origin: null',
    'sec-ch-ua: "Chromium";v="116", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile: ?0',
    'sec-ch-ua-platform: "Windows"',
    'sec-fetch-dest: document',
    'sec-fetch-mode: navigate',
    'sec-fetch-site: same-origin',
    'sec-fetch-user: ?1',
    'upgrade-insecure-requests: 1',
    'user-agent: ' + AGENT
]

# Populate this list with your favourite searX hosts
SEARXHOSTS = [
]
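# Hosts should be full base URLs without a trailing slash (the request URL is
# built as host + '/search' below), e.g. 'https://searx.example.org' as a placeholder.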


# Request a single results page from the chosen searX host
def curl_search_jobs(pageno, chosenhost):
    request = pycurl.Curl()
    request.setopt(request.FOLLOWLOCATION, 1)
    request.setopt(request.HTTPHEADER, HEADERS)
    request.setopt(request.POST, 1)
    request.setopt(request.USERAGENT, AGENT)

    # Collect the response body in memory
    buffer = BytesIO()

    post_data = {
        'q': JOBQUERY,
        'pageno': pageno
    }

    postfields = urlencode(post_data)
    request.setopt(request.POSTFIELDS, postfields)

    request.setopt(request.URL, chosenhost + '/search')
    request.setopt(request.WRITEDATA, buffer)
    request.setopt(request.CAINFO, certifi.where())
    request.setopt(request.VERBOSE, 0)

    request.perform()
    print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
    request.close()

    body = buffer.getvalue()
    return body


# Parse a BeautifulSoup object for short descriptions
def scrape_shortdesc(html):
    shortdesc = []
    for heading in html.find_all('h3'):
        shortdesc.append(heading.find('a').text)
    return shortdesc


# Parse a BeautifulSoup object for result URLs
def scrape_links(html):
    links = []
    for link in html.find_all('a', {"class": "url_wrapper"}):
        links.append(link.get('href'))
    return links


# Zip two lists together and dump to disk
def dump_jobs_file(shortdesc, links):
    with open('jobs.csv', 'a', encoding='utf8', newline='') as file:
        write = csv.writer(file)
        write.writerows(zip(shortdesc, links))


SHORTDESC = []
LINKS = []
HTMLOBJECT = []

# Choose one or more random hosts and request one or more searches from each
while HOST > 0:
    CHOSENHOST = random.choice(SEARXHOSTS)
    PAGESLEFT = PAGE

    while PAGESLEFT > 0:
        CURLBODY = curl_search_jobs(PAGESLEFT, CHOSENHOST)
        # Only the most recent page is kept for parsing at this stage
        HTMLOBJECT = BeautifulSoup(CURLBODY, 'html.parser').body
        PAGESLEFT -= 1

    HOST -= 1

try:
    SHORTDESC = scrape_shortdesc(HTMLOBJECT)
    LINKS = scrape_links(HTMLOBJECT)
except Exception as e:
    print(e)
    #print("failed to parse or 429")

print(SHORTDESC)
print(LINKS)

#try:
#    dump_jobs_file(SHORTDESC, LINKS)
#except Exception as e:
#    print(e)

# print(body.decode('iso-8859-1'))