Scrape all job data from GitHub Jobs (https://jobs.github.com) and export it in JSON format

import pdb

import json

import requests

import datetime

from bs4 import BeautifulSoup

# Walk the paginated job listing, collect one dict per job row, and write the
# collected list to github_response.json.

# Listing URL without the page number; the zero-based page index is appended.
BASE_URL = 'https://jobs.github.com/positions?page='

# Hard cap on the number of listing pages requested in one run.
MAX_PAGES = 100

# Seconds to wait on the server before requests raises a timeout, so a
# stalled connection cannot hang the run for hours.
REQUEST_TIMEOUT = 30

# Classes looked for on the first <strong> of a title cell to recognise the
# job type.
JOB_TYPE_CLASSES = ('fulltime', 'parttime', 'contract')


def _child_text(parent, tag, css_class):
    """Return the text of the first ``tag`` with ``css_class`` under ``parent``.

    Returns None when ``parent`` is None or no such element exists, so a
    missing field becomes a null in the export instead of an AttributeError.
    """
    if parent is None:
        return None
    node = parent.find(tag, {'class': css_class})
    return None if node is None else node.text


def _job_type(title_td):
    """Return the text of the title cell's first <strong> if it is a job type.

    The tag counts as a job type when its class list contains one of
    JOB_TYPE_CLASSES. Returns None otherwise, so a row never inherits the job
    type of the row parsed before it.
    """
    strong = title_td.find('strong')
    if strong is None:
        return None
    classes = strong.get('class') or []
    if any(name in classes for name in JOB_TYPE_CLASSES):
        return strong.text
    return None


def _parse_row(row):
    """Convert one <tr> of the listing table into a job dict.

    Returns None for rows that are not job rows (no ``td.title`` cell or no
    <h4> inside it) so the caller can skip them and keep going.
    """
    title_td = row.find('td', {'class': 'title'})
    heading = title_td.find('h4') if title_td is not None else None
    if heading is None:
        return None

    meta_td = row.find('td', {'class': 'meta'})
    return {
        'title': heading.text,
        'company': _child_text(title_td, 'a', 'company'),
        'job_type': _job_type(title_td),
        'location': _child_text(meta_td, 'span', 'location'),
        # NOTE(review): this is the text of span.when, presumably the posting
        # time rather than a time zone; the key name is kept so existing
        # consumers of the JSON file keep working.
        'timezone': _child_text(meta_td, 'span', 'when'),
    }


response = []

for i in range(MAX_PAGES):
    # Progress indicator: index of the page being fetched.
    print(i)

    page_link = BASE_URL + str(i)
    page_response = requests.get(page_link, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of parsing an error page.
    page_response.raise_for_status()
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # An <h1> reading 'Nothing found' inside div#page ends the pagination.
    page_div = page_content.find('div', attrs={'id': 'page'})
    page_heading = page_div.find('h1') if page_div is not None else None
    if page_heading is not None and page_heading.text.strip() == 'Nothing found':
        break

    table = page_content.find('table', attrs={'class': 'positionlist'})
    if table is None:
        # No listing table on this page: nothing to parse, stop paginating.
        break

    for row in table.find_all('tr'):
        job = _parse_row(row)
        # Skip non-job rows instead of abandoning the rest of the page.
        if job is not None:
            response.append(job)

with open('github_response.json', 'w') as outfile:
    json.dump(response, outfile)

Daily Dose - e

Scrape all job data from GitHub Jobs (https://jobs.github.com) and export it in JSON format

No comments:

Post a Comment

Featured post

A23 Rummy - Safe Secure Gaming Platform