Compare commits
1 commit: dev...v2-overhau

Author | SHA1 | Date
---|---|---
 | da835d4fb0 | 
@@ -6,12 +6,18 @@ import pycurl
 from bs4 import BeautifulSoup
 import certifi
 
-QUERY = 'remote'
+# Specify your search keywords
+QUERY = 'developer'
 JOBQUERY = '(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com)' + QUERY
-AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
-HOST = 3
-PAGE = 3
+# Spoof your device
+AGENT = 'user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
+
+# Number of hosts chosen at random
+HOST = 1
+# Number of pages requested per host
+PAGE = 1
 
+# Customize request headers if you wish
 HEADERS = [
     'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'accept-language: en-US,en;q=0.9',
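Note (not part of the diff): a minimal sketch of how the updated constants compose the query string sent to a searX host. The concatenation is reproduced from the hunk as-is, including the missing space between the closing parenthesis and the keyword.

```python
# Sketch only: evaluates the JOBQUERY string exactly as built above.
QUERY = 'developer'
JOBQUERY = ('(site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io '
            'OR site:jobs.lever.co OR site:jobs.workable.com)' + QUERY)

print(JOBQUERY)
# (site:breezy.hr OR site:ashbyhq.com OR site:boards.greenhouse.io OR site:jobs.lever.co OR site:jobs.workable.com)developer
```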
@@ -27,14 +33,15 @@ HEADERS = [
     'sec-fetch-site: same-origin',
     'sec-fetch-user: ?1',
     'upgrade-insecure-requests: 1',
-    'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
+    'user-agent: ' + AGENT
 ]
 
-#Populate this list with your favourite searX hosts
+# Populate this list with your favourite searX hosts
 SEARXHOSTS = [
 ]
 
-def curl_search_jobs(page, jobgrepurl):
+# Request a number of pages from the chosen searX host
+def curl_search_jobs(pages, chosenhost):
     request = pycurl.Curl()
     request.setopt(request.FOLLOWLOCATION, 1)
     request.setopt(request.HTTPHEADER, HEADERS)
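Note (not part of the diff): the new AGENT constant already starts with 'user-agent: ', so the 'user-agent: ' + AGENT entry added to HEADERS in this hunk evaluates to a doubled prefix. A quick check, using the constant as defined above:

```python
# Sketch only: shows the header string that 'user-agent: ' + AGENT produces.
AGENT = ('user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) '
         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36')

print('user-agent: ' + AGENT)
# user-agent: user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36
```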
@@ -51,10 +58,10 @@ def curl_search_jobs(page, jobgrepurl):
     POSTFIELDS = urlencode(post_data)
     request.setopt(request.POSTFIELDS, POSTFIELDS)
 
-    request.setopt(request.URL, JOBGREPURL + '/search')
+    request.setopt(request.URL, CHOSENHOST + '/search')
     request.setopt(request.WRITEDATA, buffer)
     request.setopt(request.CAINFO, certifi.where())
-    # request.setopt(request.VERBOSE, 1)
+    request.setopt(request.VERBOSE, 0)
 
     request.perform()
     print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
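Note (not part of the diff): a self-contained sketch of the pycurl POST flow that curl_search_jobs relies on. The host URL and form-field names below are placeholders, not values taken from the commit.

```python
# Minimal pycurl POST sketch: buffer, TLS bundle, perform, status code.
from io import BytesIO
from urllib.parse import urlencode

import certifi
import pycurl

buffer = BytesIO()
request = pycurl.Curl()
request.setopt(request.FOLLOWLOCATION, 1)
request.setopt(request.POSTFIELDS, urlencode({'q': 'developer', 'pageno': 1}))  # placeholder fields
request.setopt(request.URL, 'https://searx.example/search')  # placeholder host
request.setopt(request.WRITEDATA, buffer)
request.setopt(request.CAINFO, certifi.where())
request.setopt(request.VERBOSE, 0)
request.perform()
print(f"Status {request.getinfo(request.RESPONSE_CODE)}")
request.close()

body = buffer.getvalue()
```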
@@ -63,44 +70,54 @@ def curl_search_jobs(page, jobgrepurl):
     body = buffer.getvalue()
     return body
 
+# Parse BeautifulSoup objects for short descriptions
+def scrape_shortdesc(html):
+    for link in html.find_all('h3'):
+        print(link.find('a').text)
+
+# Parse BeautifulSoup objects for URLs
+def scrape_links(html):
+    for link in html.find_all('a', {"class": "url_wrapper"}):
+        print(link.get('href'))
+
+# Zip two lists together and dump to disk
 def dump_jobs_file(shortdesc, links):
     with open('jobs.csv', 'a', encoding='utf8') as file:
         write = csv.writer(file)
         write.writerows(zip(shortdesc, links))
         file.close()
 
+SHORTDESC = []
+LINKS = []
+HTMLOBJECT = []
 
-def scrape_shortdesc(html):
-    shortdesc = []
-    for article in html.find_all('h3'):
-        for highlight in article.find_all('a'):
-            shortdesc.append(highlight.text)
-    return shortdesc
-
-
-def scrape_links(html):
-    links = []
-    for link in html.find_all('a', {"class": "url_wrapper"}):
-        links.append(link['href'])
-    return links
-
-SCRAPEDDESC = []
-SCRAPEDLINKS = []
-
+# Choose one or more random hosts and request one or more searches from each
 while HOST > 0:
-    JOBGREPURL = random.choice(SEARXHOSTS)
+    CHOSENHOST = random.choice(SEARXHOSTS)
 
     while PAGE > 0:
 
-        CURLBODY = curl_search_jobs(PAGE, JOBGREPURL)
-        PARSEDHTML = BeautifulSoup(CURLBODY, 'html.parser')
+        CURLBODY = curl_search_jobs(PAGE, CHOSENHOST)
+        HTMLOBJECT = BeautifulSoup(CURLBODY, 'html.parser').body
 
-        SCRAPEDDESC = scrape_shortdesc(PARSEDHTML)
-        SCRAPEDLINKS = scrape_links(PARSEDHTML)
-        dump_jobs_file(SCRAPEDDESC, SCRAPEDLINKS)
-
         PAGE -= 1
     HOST -= 1
 
+try:
+    SHORTDESC = scrape_shortdesc(HTMLOBJECT)
+    LINKS = scrape_links(HTMLOBJECT)
+except Exception as e:
+    print(e)
+    #print("failed to parse or 429")
+
+print(SHORTDESC)
+print(LINKS)
+
+#try:
+# dump_jobs_file(SHORTDESC,LINKS)
+#except Exception as e:
+# print(e)
+
+
 
 # print(body.decode('iso-8859-1'))
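Note (not part of the diff): a self-contained sketch of the BeautifulSoup selectors and the zip-to-CSV pattern used above, run against a tiny inline HTML snippet. The sample markup is an assumption chosen only to match the selectors; it is not real searX output.

```python
# Sketch only: h3 > a text for short descriptions, a.url_wrapper hrefs for links,
# then the same zip-and-append CSV dump as dump_jobs_file.
import csv
from bs4 import BeautifulSoup

SAMPLE = """
<article>
  <h3><a href="#">Remote Developer</a></h3>
  <a class="url_wrapper" href="https://jobs.example/123">listing</a>
</article>
"""

html = BeautifulSoup(SAMPLE, 'html.parser')

shortdesc = [a.text for h3 in html.find_all('h3') for a in h3.find_all('a')]
links = [a['href'] for a in html.find_all('a', {"class": "url_wrapper"})]

with open('jobs.csv', 'a', encoding='utf8', newline='') as file:
    csv.writer(file).writerows(zip(shortdesc, links))
```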