Kumar Suresh: How to scrape all job listings from https://stackoverflow.com/jobs

'''

 Scrape all Stack Overflow job listings and save the results to a file in JSON format

''' 

import json

import requests

import datetime

from bs4 import BeautifulSoup

BASE_URL = 'https://stackoverflow.com'
LISTING_URL = BASE_URL + '/jobs?sort=p'
OUTPUT_FILE = 'response.json'

# Seconds to wait for a single HTTP response before giving up on it.
REQUEST_TIMEOUT = 30

# CSS class on a header <span>  ->  key used for it in the output record.
HEADER_DETAIL_CLASSES = {
    '-salary': 'salary',
    '-remote': 'remote',
    '-visa': 'visa',
    '-relocation': 'relocation',
}

# Raised by the parsing helpers when a page lacks an expected element.
PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)


def fetch_soup(url):
    """Download *url* and return its body parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')


def parse_page_count(label):
    """Convert a pagination label to a page count of at least 1.

    Anything that is not a whole number (``None``, text such as "next")
    yields 1 so that the first listing page is still scraped.
    """
    try:
        count = int(label)
    except (TypeError, ValueError):
        return 1
    return max(count, 1)


def page_numbers(page_count):
    """Return the page numbers to request: 1 through *page_count* inclusive."""
    # NOTE(review): assumes the ``pg`` query parameter is 1-based, matching
    # the numeric pagination label the page count is read from - confirm.
    return range(1, page_count + 1)


def find_page_count(listing_soup):
    """Read the number of listing pages from a parsed listing page.

    The count is taken from the <span> inside the second-to-last
    ``a.job-link`` element; 1 is returned when that element is missing.
    """
    links = listing_soup.find_all('a', attrs={'class': 'job-link'})
    if len(links) < 2:
        return 1
    label = links[-2].find('span')
    return parse_page_count(None if label is None else label.text)


def normalize_label(text):
    """Turn an "About this job" label into a snake_case dictionary key."""
    return text.strip().replace(':', '').replace(' ', '_').lower()


def join_tech_stack(names):
    """Join technology names with ', ' and no trailing separator."""
    return ', '.join(names)


def parse_header_details(job_header):
    """Collect salary / remote / visa / relocation texts from the job header.

    Every key is always present; a value stays '' when the matching <span>
    is absent from the header.
    """
    details = dict.fromkeys(HEADER_DETAIL_CLASSES.values(), '')
    container = job_header.find('div', {'class': 'mt12'})
    if container is None:
        return details
    for span in container.find_all('span'):
        # A span without a class attribute is simply not one of ours.
        span_classes = span.get('class', [])
        for css_class, key in HEADER_DETAIL_CLASSES.items():
            if css_class in span_classes:
                details[key] = span.text.strip()
    return details


def parse_job(job_soup, job_url):
    """Build the output record for one parsed job page.

    Raises:
        One of ``PARSE_ERRORS`` when the page lacks an expected element
        (header, logo, title, company block or the three overview sections).
    """
    job_header = job_soup.find_all(
        'header', {'class': 'job-details--header'})[0]

    company_logo = job_header.find('div', {'class': 's-avatar'}).find('img')['src']
    job_title = job_header.find('h1', {'class': 'fs-headline1'}).find('a').text
    company_info = job_soup.find(
        'div', attrs={'class': 'fc-black-700 fs-body3'}).text.split('\n')

    job = {
        'timestamp': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'job_url': job_url,
        'company_logo': company_logo,
        'job_title': job_title,
        # The company block's text has the name on its second line and the
        # location on its fourth.
        'company_name': company_info[1],
        'company_location': company_info[3],
    }
    job.update(parse_header_details(job_header))

    overview = job_soup.find('div', attrs={'id': 'overview-items'})
    sections = overview.find_all('section', {'class': 'mb32'})
    about_section, tech_section, description_section = sections[:3]

    # ---- Section one: "About this job" label/value pairs ---- #
    for row in about_section.find_all('div', {'class': 'mb8'}):
        spans = row.find_all('span')
        if len(spans) < 2:
            continue  # not a label/value row
        job[normalize_label(spans[0].text)] = spans[1].text

    # ---- Section two: technologies ---- #
    tech_links = tech_section.find_all('a', {'class': 'job-link'})
    job['technologies'] = join_tech_stack(link.text for link in tech_links)

    # ---- Section three: job description ---- #
    job['job_description'] = description_section.find('div').text

    return job


def scrape_jobs(max_pages=None):
    """Scrape the job listing pages and return a list of job records.

    Args:
        max_pages: optional upper bound on the number of listing pages to
            visit; ``None`` (the default) visits every page the site reports.

    A listing page or job page that cannot be downloaded or parsed is
    reported on stdout and skipped, so one bad page does not discard the
    jobs already collected.

    Raises:
        requests.RequestException: if the initial listing page, which is
            needed to learn the page count, cannot be downloaded.
    """
    page_count = find_page_count(fetch_soup(LISTING_URL))
    if max_pages is not None:
        page_count = min(page_count, max_pages)

    jobs = []
    for page_number in page_numbers(page_count):
        listing_url = LISTING_URL + '&pg=' + str(page_number)
        try:
            listing = fetch_soup(listing_url)
        except requests.RequestException as err:
            print('Skipping listing page {}: {}'.format(listing_url, err))
            continue

        for link in listing.find_all('a', attrs={'class': 's-link__visited'}):
            href = link.get('href')
            if not href:
                continue
            job_url = BASE_URL + href
            try:
                jobs.append(parse_job(fetch_soup(job_url), job_url))
            except requests.RequestException as err:
                print('Skipping job {}: download failed: {}'.format(job_url, err))
            except PARSE_ERRORS as err:
                print('Skipping job {}: unexpected page layout: {!r}'.format(
                    job_url, err))
    return jobs


def main():
    """Scrape every job and write the records to ``response.json``."""
    jobs = scrape_jobs()
    with open(OUTPUT_FILE, 'w') as outfile:
        json.dump(jobs, outfile)


if __name__ == '__main__':
    main()
Kumar Suresh

Friday 23 November 2018

How to scrape all job listings from https://stackoverflow.com/jobs

No comments:

Post a Comment