import logging
import pdb
from string import ascii_lowercase

import requests
import xlsxwriter
from bs4 import BeautifulSoup
# Scrape the member directory on cap-acp.ca (one search per lowercase letter)
# and store one spreadsheet row per member profile in OUTPUT_FILE.

SITE_ROOT = 'http://www.cap-acp.ca'
SEARCH_URL = 'http://www.cap-acp.ca/en/search/index.php?page=4&search='
OUTPUT_FILE = 'users_data.xlsx'
COLUMN_HEADERS = (
    'Name',
    'Address',
    'Country',
    'PinCode',
    'Phone Number',
    'Fax',
    'Email ID',
    'Web Site',
)
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30
# Number of leading text lines of a profile panel that the parser looks at.
PROFILE_LINE_COUNT = 10

logger = logging.getLogger(__name__)


def parse_profile_lines(lines):
    """Turn the text lines of a profile panel into one worksheet row.

    The positions are fixed: lines 2-6 are taken as name, address, country,
    pin code and phone number; lines 7, 8 and 9 are kept only when they
    contain the labels 'Fax', 'E-mail' and 'Web site' respectively.
    NOTE(review): this layout is inherited from the original script and has
    not been checked against the live pages.

    Args:
        lines: Sequence of strings (the panel text split on newlines).
            May be shorter than ``PROFILE_LINE_COUNT``; missing lines are
            treated as empty strings instead of raising ``IndexError``.

    Returns:
        An 8-tuple ordered like ``COLUMN_HEADERS``.
    """
    padded = list(lines[:PROFILE_LINE_COUNT])
    padded += [''] * (PROFILE_LINE_COUNT - len(padded))
    (name, address, country, pin_code, phone_number,
     fax, email, website) = padded[2:]
    if 'Fax' not in fax:
        fax = ''
    if 'E-mail' not in email:
        email = ''
    if 'Web site' not in website:
        website = ''
    return (name, address, country, pin_code, phone_number,
            fax, email, website)


def fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    Raises:
        requests.RequestException: on connection errors, timeouts or an
            HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def iter_profile_paths(listing):
    """Yield the href of each link inside the listing's 'main' divs.

    Anchors without an ``href`` attribute and links back to 'index.php'
    are skipped.
    """
    for container in listing.find_all('div', attrs={'class': 'main'}):
        for anchor in container.find_all('a', href=True):
            if anchor['href'] != 'index.php':
                yield anchor['href']


def scrape_profile(path):
    """Fetch one profile page and return its worksheet row.

    Args:
        path: The link target taken from the listing page; it is appended
            to ``SITE_ROOT`` unchanged.

    Returns:
        The tuple built by ``parse_profile_lines`` from the first 'panel'
        div, or ``None`` when the page has no such div.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    page = fetch_soup(SITE_ROOT + path)
    panel = page.find('div', attrs={'class': 'panel'})
    if panel is None:
        return None
    return parse_profile_lines(panel.text.split('\n'))


def main():
    """Run the scrape and write the results to ``OUTPUT_FILE``.

    A page that cannot be fetched or has no profile panel is logged and
    skipped, so one bad page never discards the remaining results.
    """
    # The context manager closes (and thereby saves) the workbook even if
    # the run is interrupted, so already-collected rows are not lost.
    with xlsxwriter.Workbook(OUTPUT_FILE) as workbook:
        worksheet = workbook.add_worksheet()
        for col, title in enumerate(COLUMN_HEADERS):
            worksheet.write(0, col, title)
        row = 1

        for ascii_char in ascii_lowercase:
            try:
                listing = fetch_soup(SEARCH_URL + ascii_char)
            except requests.RequestException:
                logger.exception("Could not fetch listing for %r", ascii_char)
                continue

            for path in iter_profile_paths(listing):
                try:
                    record = scrape_profile(path)
                except requests.RequestException:
                    logger.exception("Could not fetch profile %s", path)
                    continue
                if record is None:
                    logger.warning("No profile panel found at %s", path)
                    continue
                for col, value in enumerate(record):
                    worksheet.write(row, col, value)
                row += 1


if __name__ == '__main__':
    main()
# No comments:
# Post a Comment