import csv
import random
from urllib.parse import urlencode
from io import BytesIO

import pycurl
from bs4 import BeautifulSoup
import certifi

# Search term, combined with a site: filter that restricts results to a
# handful of common job-board hosts. Note the space before the term so the
# filter and the query don't run together.
QUERY = 'remote'
JOBQUERY = ('(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io '
            'OR site:jobs.lever.co OR site:jobs.workable.com) ' + QUERY)

# Bare user-agent string; pycurl's USERAGENT option expects the value only,
# without a "user-agent:" header prefix.
AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'

HOSTS = 3  # number of random SearX hosts to query
PAGES = 3  # number of result pages to fetch from each host

HEADERS = [
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language: en-US,en;q=0.9',
    'cache-control: max-age=0',
    'content-type: application/x-www-form-urlencoded',
    'cookie: categories=general; language=en-US; locale=en; autocomplete=google; image_proxy=1; method=POST; safesearch=0; theme=simple; results_on_new_tab=0; doi_resolver=oadoi.org; simple_style=auto; center_alignment=0; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=; disabled_plugins=; enabled_plugins=searx.plugins.limiter; tokens=; maintab=on; enginetab=on',
    'origin: null',
    'sec-ch-ua: "Chromium";v="116", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile: ?0',
    'sec-ch-ua-platform: "Windows"',
    'sec-fetch-dest: document',
    'sec-fetch-mode: navigate',
    'sec-fetch-site: same-origin',
    'sec-fetch-user: ?1',
    'upgrade-insecure-requests: 1',
    'user-agent: ' + AGENT
]

# Populate this list with your favourite SearX hosts.
SEARXHOSTS = [
]


def curl_search_jobs(page, jobgrepurl):
    """POST the job query to a SearX host and return the raw response body."""
    request = pycurl.Curl()
    request.setopt(request.FOLLOWLOCATION, 1)
    request.setopt(request.HTTPHEADER, HEADERS)
    request.setopt(request.POST, 1)
    request.setopt(request.USERAGENT, AGENT)
    buffer = BytesIO()
    post_data = {
        'q': JOBQUERY,
        'pageno': page  # use the requested page, not the module-level constant
    }
    postfields = urlencode(post_data)
    request.setopt(request.POSTFIELDS, postfields)
    request.setopt(request.URL, jobgrepurl + '/search')
    request.setopt(request.WRITEDATA, buffer)
    request.setopt(request.CAINFO, certifi.where())
    # request.setopt(request.VERBOSE, 1)
    request.perform()
    print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
    request.close()
    body = buffer.getvalue()
    # print(body.decode('iso-8859-1'))
    return body


def dump_jobs_file(shortdesc, links):
    """Append (title, link) pairs to jobs.csv."""
    # newline='' keeps csv.writer from inserting blank rows on Windows;
    # the with block closes the file, so no explicit close() is needed.
    with open('jobs.csv', 'a', encoding='utf8', newline='') as file:
        write = csv.writer(file)
        write.writerows(zip(shortdesc, links))


def scrape_shortdesc(html):
    """Collect the result titles (the link text inside each <h3>)."""
    shortdesc = []
    for article in html.find_all('h3'):
        for highlight in article.find_all('a'):
            shortdesc.append(highlight.text)
    return shortdesc


def scrape_links(html):
    """Collect the result URLs from SearX's url_wrapper anchors."""
    links = []
    for link in html.find_all('a', {"class": "url_wrapper"}):
        links.append(link['href'])
    return links


# Query a few random hosts; the page counter restarts for every host, so
# each one is asked for the same range of result pages (last page first).
for _ in range(HOSTS):
    JOBGREPURL = random.choice(SEARXHOSTS)
    for pageno in range(PAGES, 0, -1):
        CURLBODY = curl_search_jobs(pageno, JOBGREPURL)
        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')
        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
        SCRAPEDLINKS = scrape_links(PARSEDHTML)
        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
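
# A quick sanity check, sketched here as a comment (it assumes a run has
# already produced jobs.csv with the two columns written above): read the
# dumped rows back and print them.
#
#   import csv
#   with open('jobs.csv', encoding='utf8') as f:
#       for title, link in csv.reader(f):
#           print(f'{title} -> {link}')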