Friday 23 November 2018

How to scrape the full job list from https://stackoverflow.com/jobs

'''
 Scrape all Stack Overflow job listings and save the results to a file in JSON format
'''

import json
import requests
import datetime
from bs4 import BeautifulSoup

BASE_URL = 'https://stackoverflow.com'
LISTING_URL = BASE_URL + '/jobs?sort=p'

# Seconds to wait for a single HTTP request. The previous value of 9999 let
# one stalled connection block the crawl for almost three hours.
REQUEST_TIMEOUT = 30

# Optional cap on the number of listing pages to crawl; None crawls every
# page reported by the pagination widget. Set a small integer while testing.
MAX_PAGES = None

OUTPUT_FILE = 'response.json'

# CSS class found on each perk <span> in the job header -> output key.
PERK_CLASSES = (
    ('-salary', 'salary'),
    ('-remote', 'remote'),
    ('-visa', 'visa'),
    ('-relocation', 'relocation'),
)


def fetch_soup(url):
    """Download *url* and return its body parsed with ``html.parser``.

    Raises ``requests.RequestException`` on network failure, timeout, or a
    4xx/5xx status, so an error page is never parsed as if it were a job.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')


def get_page_count(listing_soup):
    """Return the number of listing pages advertised by the pagination links.

    The count is read from the ``<span>`` inside the second-to-last
    ``a.job-link`` element. If that element is missing or not numeric the
    function returns 1, so at least the first page is still crawled.
    """
    links = listing_soup.findAll('a', attrs={'class': 'job-link'})
    try:
        return int(links[-2].find('span').text)
    except (IndexError, AttributeError, ValueError):
        return 1


def page_numbers(page_count, max_pages=None):
    """Return the page numbers to crawl: 1..page_count, capped at *max_pages*.

    NOTE(review): assumes the ``pg`` query parameter is 1-based, like the
    numbers shown in the pagination links - confirm against the live site.
    """
    last = page_count if max_pages is None else min(page_count, max_pages)
    return range(1, last + 1)


def normalize_label(text):
    """Turn a heading such as ``' Job type: '`` into the key ``'job_type'``."""
    return text.strip().replace(':', '').replace(' ', '_').lower()


def split_company_info(text):
    """Return ``(company_name, company_location)`` from the header text.

    The text is split on newlines; the name is taken from line index 1 and
    the location from line index 3. A missing line yields an empty string
    instead of raising ``IndexError``.
    """
    parts = text.split('\n')
    name = parts[1] if len(parts) > 1 else ''
    location = parts[3] if len(parts) > 3 else ''
    return name, location


def extract_perks(job_header):
    """Return salary/remote/visa/relocation texts found in *job_header*.

    Every key defaults to ``''``. A header without the ``div.mt12``
    container, or a ``<span>`` without a ``class`` attribute, is simply
    skipped rather than being hidden behind a bare ``except``.
    """
    perks = dict((key, '') for _, key in PERK_CLASSES)
    container = job_header.find('div', {'class': 'mt12'})
    if container is None:
        return perks
    for span in container.findAll('span'):
        span_classes = span.get('class', [])
        for css_class, key in PERK_CLASSES:
            if css_class in span_classes:
                perks[key] = span.text.strip()
    return perks


def scrape_job(job_url):
    """Fetch one job detail page and return its fields as a dict.

    Raises ``requests.RequestException`` if the page cannot be downloaded,
    and ``AttributeError``/``IndexError``/``KeyError``/``TypeError`` if the
    page lacks one of the required elements (header, logo, title, company
    line, or the three overview sections).
    """
    soup = fetch_soup(job_url)

    # ----- Job details ----- #
    header = soup.findAll('header', {'class': 'job-details--header'})[0]
    logo = header.find('div', {'class': 's-avatar'}).find('img')['src']
    title = header.find('h1', {'class': 'fs-headline1'}).find('a').text

    company_text = soup.find(
        'div', attrs={'class': 'fc-black-700 fs-body3'}).text
    company_name, company_location = split_company_info(company_text)
    perks = extract_perks(header)

    job = {
        'timestamp': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'job_url': job_url,
        'company_logo': logo,
        'job_title': title,
        'company_name': company_name,
        'company_location': company_location,
        'salary': perks['salary'],
        'remote': perks['remote'],
        'visa': perks['visa'],
        'relocation': perks['relocation'],
    }

    overview = soup.find('div', attrs={'id': 'overview-items'})
    sections = overview.findAll('section', {'class': 'mb32'})

    # ----- Section one: about this job ----- #
    for row in sections[0].findAll('div', {'class': 'mb8'}):
        spans = row.findAll('span')
        if len(spans) < 2:
            # A row without a label/value pair carries no data to record.
            continue
        job[normalize_label(spans[0].text)] = spans[1].text

    # ----- Section two: technologies ----- #
    # join() avoids the trailing ', ' the concatenation loop used to leave.
    tech_links = sections[1].findAll('a', {'class': 'job-link'})
    job['technologies'] = ', '.join(tag.text for tag in tech_links)

    # ----- Section three: job description ----- #
    job['job_description'] = sections[2].find('div').text

    return job


def main():
    """Crawl every listing page and write all scraped jobs to OUTPUT_FILE."""
    import sys

    first_page = fetch_soup(LISTING_URL)
    page_count = get_page_count(first_page)

    jobs = []
    for page in page_numbers(page_count, MAX_PAGES):
        listing = fetch_soup(LISTING_URL + '&pg=' + str(page))
        for link in listing.findAll('a', attrs={'class': 's-link__visited'}):
            href = link.get('href')
            if not href:
                continue
            job_url = BASE_URL + href
            try:
                jobs.append(scrape_job(job_url))
            except (requests.RequestException, AttributeError, IndexError,
                    KeyError, TypeError) as err:
                # One malformed or unreachable job page must not discard
                # everything collected so far; report it and move on.
                sys.stderr.write(
                    'Skipping {}: {!r}\n'.format(job_url, err))

    with open(OUTPUT_FILE, 'w') as outfile:
        json.dump(jobs, outfile)


if __name__ == '__main__':
    main()

No comments:

Post a Comment