From da835d4fb03d369b9781203311f6de14e5c04a2c Mon Sep 17 00:00:00 2001
From: mharb
Date: Fri, 27 Oct 2023 14:16:49 +0000
Subject: [PATCH] Check-in Progress on V2

Saving my progress on job_scraper_v2. This overhaul will separate the
cURL requests and the HTML parsing into distinct stages. Buyer beware:
it is far from complete.

The flow: request all HTML documents first, close all network
connections, then parse the HTML, and finally save the results to disk.
---
 job_scraper.py | 85 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 34 deletions(-)

diff --git a/job_scraper.py b/job_scraper.py
index 1b2ac3a..b4850a4 100644
--- a/job_scraper.py
+++ b/job_scraper.py
@@ -6,12 +6,18 @@ import pycurl
 from bs4 import BeautifulSoup
 import certifi
 
-QUERY = 'remote'
+# Specify your search keywords
+QUERY = 'developer'
 JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com)' + QUERY
 
-AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
-HOST = 3
-PAGE = 3
+# Spoof your device
+AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
+# Number of hosts chosen at random
+HOST = 1
+# Number of pages requested per host
+PAGE = 1
+
+# Customize request headers if you wish
 HEADERS = [
     'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'accept-language: en-US,en;q=0.9',
@@ -27,14 +33,15 @@ HEADERS = [
     'sec-fetch-site: same-origin',
     'sec-fetch-user: ?1',
     'upgrade-insecure-requests: 1',
-    'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
+    AGENT
 ]
 
-#Populate this list with your favourite searX hosts
+# Populate this list with your favourite searX hosts
 SEARXHOSTS = [
 ]
 
-def curl_search_jobs(page, jobgrepurl):
+# Request a number of pages from the chosen searX host
+def curl_search_jobs(pages, chosenhost):
     request = pycurl.Curl()
     request.setopt(request.FOLLOWLOCATION, 1)
     request.setopt(request.HTTPHEADER, HEADERS)
@@ -51,10 +58,10 @@ def curl_search_jobs(page, jobgrepurl):
     POSTFIELDS = urlencode(post_data)
     request.setopt(request.POSTFIELDS, POSTFIELDS)
 
-    request.setopt(request.URL, JOBGREPURL + '/search')
+    request.setopt(request.URL, CHOSENHOST + '/search')
     request.setopt(request.WRITEDATA, buffer)
     request.setopt(request.CAINFO, certifi.where())
-    # request.setopt(request.VERBOSE, 1)
+    request.setopt(request.VERBOSE, 0)
 
     request.perform()
     print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
@@ -63,44 +70,54 @@
     body = buffer.getvalue()
     return body
 
 
+# Parse BeautifulSoup objects for short descriptions
+def scrape_shortdesc(html):
+    shortdesc = [link.find('a').text for link in html.find_all('h3')]
+    return shortdesc
+# Parse BeautifulSoup objects for URLs
+def scrape_links(html):
+    links = [link.get('href') for link in html.find_all('a', {"class": "url_wrapper"})]
+    return links
+
+# Zip two lists together and dump to disk
 def dump_jobs_file(shortdesc, links):
     with open('jobs.csv', 'a', encoding='utf8') as file:
         write = csv.writer(file)
         write.writerows(zip(shortdesc, links))
 
     file.close()
 
+SHORTDESC = []
+LINKS = []
+HTMLOBJECT = []
 
-def scrape_shortdesc(html):
-    shortdesc = []
-    for article in html.find_all('h3'):
-        for highlight in article.find_all('a'):
-            shortdesc.append(highlight.text)
-    return shortdesc
-
-
-def scrape_links(html):
-    links = []
-    for link in html.find_all('a', {"class": "url_wrapper"}):
-        links.append(link['href'])
-    return links
-
-SCRAPEDDESC = []
-SCRAPEDLINKS = []
-
 
+# Choose one or more random hosts and request one or more searches from each
 while HOST > 0:
-    JOBGREPURL = random.choice(SEARXHOSTS)
+    CHOSENHOST = random.choice(SEARXHOSTS)
     while PAGE > 0:
-        CURLBODY = curl_search_jobs(PAGE, JOBGREPURL)
-        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')
-
-        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
-        SCRAPEDLINKS = scrape_links(PARSEDHTML)
-        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
+        CURLBODY = curl_search_jobs(PAGE, CHOSENHOST)
+        HTMLOBJECT = BeautifulSoup(CURLBODY, 'html.parser').body
         PAGE -= 1
     HOST -= 1
 
 
-# print(body.decode('iso-8859-1'))
+try:
+    SHORTDESC = scrape_shortdesc(HTMLOBJECT)
+    LINKS = scrape_links(HTMLOBJECT)
+except Exception as e:
+    print(e)
+    #print("failed to parse or 429")
+
+print(SHORTDESC)
+print(LINKS)
+
+#try:
+#    dump_jobs_file(SHORTDESC,LINKS)
+#except Exception as e:
+#    print(e)
+
+
+
+# print(body.decode('iso-8859-1'))
\ No newline at end of file
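
Where this is headed, roughly: pull every page down first, let the curl handles go, then parse and scrape offline, and only write jobs.csv at the end. The sketch below is just that, a sketch; it leans on the helpers in this patch (curl_search_jobs, scrape_shortdesc, scrape_links, dump_jobs_file), assumes curl_search_jobs cleans up its own handle after perform(), and uses a couple of names (RAWPAGES, pageno) that do not exist in the script yet.

# Sketch of the intended staged flow; not part of this patch.
import random
from bs4 import BeautifulSoup   # already imported at the top of job_scraper.py

RAWPAGES = []   # raw response bytes, one entry per requested page (new name)

# Stage 1: request every page from each randomly chosen host, keep only the bytes
for _ in range(HOST):
    chosenhost = random.choice(SEARXHOSTS)
    for pageno in range(1, PAGE + 1):
        RAWPAGES.append(curl_search_jobs(pageno, chosenhost))

# Stage 2: all network work is done, parse the saved bodies offline
HTMLOBJECT = [BeautifulSoup(raw, 'html.parser').body for raw in RAWPAGES]

# Stage 3: scrape every parsed page, then dump everything to disk in one go
SHORTDESC = []
LINKS = []
for soup in HTMLOBJECT:
    if soup is None:        # a blocked or empty response has no <body>
        continue
    SHORTDESC += scrape_shortdesc(soup)
    LINKS += scrape_links(soup)

dump_jobs_file(SHORTDESC, LINKS)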
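
The two scrape helpers are also easy to sanity-check without touching the network. The snippet below feeds them a hand-written fragment shaped like a searX result (an approximation of the markup, not a captured page) so you can see exactly what the h3 > a and a.url_wrapper selectors pull out; run it in a REPL with the two helpers in scope.

from bs4 import BeautifulSoup

# Hand-written stand-in for a single searX result; the real markup may differ.
SAMPLE = """
<article class="result">
  <a class="url_wrapper" href="https://jobs.lever.co/example/123">jobs.lever.co</a>
  <h3><a href="https://jobs.lever.co/example/123">Backend Developer (Remote)</a></h3>
  <p class="content">Example posting text.</p>
</article>
"""

soup = BeautifulSoup(SAMPLE, 'html.parser')
print(scrape_shortdesc(soup))   # ['Backend Developer (Remote)']
print(scrape_links(soup))       # ['https://jobs.lever.co/example/123']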