py-searX-job-scraper/job_scraper.py

import csv
import random
from urllib.parse import urlencode
from io import BytesIO
import pycurl
from bs4 import BeautifulSoup
import certifi
QUERY = 'remote'
# Keep a space between the site: filter group and the keyword,
# otherwise the two run together as ")remote"
JOBQUERY = ('(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io '
            'OR site:jobs.lever.co OR site:jobs.workable.com) ' + QUERY)
# Bare UA value: pycurl's USERAGENT option takes just the value,
# without a "user-agent:" header prefix
AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
HOST = 3  # number of searX hosts to query
PAGE = 3  # result pages to fetch from each host
HEADERS = [
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language: en-US,en;q=0.9',
    'cache-control: max-age=0',
    'content-type: application/x-www-form-urlencoded',
    'cookie: categories=general; language=en-US; locale=en; autocomplete=google; image_proxy=1; method=POST; safesearch=0; theme=simple; results_on_new_tab=0; doi_resolver=oadoi.org; simple_style=auto; center_alignment=0; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=; disabled_plugins=; enabled_plugins=searx.plugins.limiter; tokens=; maintab=on; enginetab=on',
    'origin: null',
    'sec-ch-ua: "Chromium";v="116", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile: ?0',
    'sec-ch-ua-platform: "Windows"',
    'sec-fetch-dest: document',
    'sec-fetch-mode: navigate',
    'sec-fetch-site: same-origin',
    'sec-fetch-user: ?1',
    'upgrade-insecure-requests: 1',
    'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
]
# Populate this list with your favourite searX hosts
SEARXHOSTS = [
]
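
# Example population (hypothetical placeholder URLs, not real instances):
# SEARXHOSTS = [
#     'https://searx.example.org',
#     'https://search.example.net',
# ]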

def curl_search_jobs(page, jobgrepurl):
    """POST a search to one searX host and return the raw response body."""
    request = pycurl.Curl()
    request.setopt(request.FOLLOWLOCATION, 1)
    request.setopt(request.HTTPHEADER, HEADERS)
    request.setopt(request.POST, 1)
    request.setopt(request.USERAGENT, AGENT)
    buffer = BytesIO()
    post_data = {
        'q': JOBQUERY,
        'pageno': page  # the page argument, not the module-level counter
    }
    request.setopt(request.POSTFIELDS, urlencode(post_data))
    request.setopt(request.URL, jobgrepurl + '/search')
    request.setopt(request.WRITEDATA, buffer)
    request.setopt(request.CAINFO, certifi.where())
    # request.setopt(request.VERBOSE, 1)
    request.perform()
    print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
    request.close()
    body = buffer.getvalue()
    return body
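
# For reference, each request above is roughly the following shell command
# (a sketch; the host is a placeholder, substitute one from SEARXHOSTS):
#   curl -L -A '<AGENT>' -d 'q=<JOBQUERY>&pageno=<page>' https://searx.example.org/search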

def dump_jobs_file(shortdesc, links):
    """Append (description, link) rows to jobs.csv."""
    # newline='' lets the csv module control line endings; the context
    # manager closes the file, so no explicit close() is needed
    with open('jobs.csv', 'a', encoding='utf8', newline='') as file:
        write = csv.writer(file)
        write.writerows(zip(shortdesc, links))

def scrape_shortdesc(html):
    """Collect result titles from the anchors inside each <h3>."""
    shortdesc = []
    for article in html.find_all('h3'):
        for highlight in article.find_all('a'):
            shortdesc.append(highlight.text)
    return shortdesc

def scrape_links(html):
    """Collect result URLs from the url_wrapper anchors."""
    links = []
    for link in html.find_all('a', {"class": "url_wrapper"}):
        links.append(link['href'])
    return links
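
# Both scrape_* helpers assume the markup of searX's "simple" theme (pinned
# via theme=simple in the cookie header above), roughly:
#   <h3><a href="...">result title</a></h3>          -> scrape_shortdesc
#   <a class="url_wrapper" href="...">result URL</a> -> scrape_links
# Other themes may need different selectors.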

for _ in range(HOST):
    JOBGREPURL = random.choice(SEARXHOSTS)
    # Fetch PAGE result pages from this host; the page counter starts
    # over for every host rather than being decremented once globally
    for PAGENO in range(1, PAGE + 1):
        CURLBODY = curl_search_jobs(PAGENO, JOBGREPURL)
        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')
        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
        SCRAPEDLINKS = scrape_links(PARSEDHTML)
        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
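
# Usage sketch: populate SEARXHOSTS above, then run `python job_scraper.py`;
# each run appends "<title>,<link>" rows to jobs.csv.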
# print(CURLBODY.decode('iso-8859-1'))