Friday 23 November 2018

Scrape profile data for all Instagram users

import pdb
import json
import requests
import datetime
from bs4 import BeautifulSoup
import xlsxwriter
import time

# Next-row cursors (row -> worksheet, row1 -> worksheet2) and the leftmost
# column; every cell written below is addressed relative to these.
col = 0
row = 0
row1 = 0

# First workbook: seven-column sheet.
summary_headers = (
    'Name',
    'Handle',
    'Email',
    'Category',
    'No Of Followers',
    'Average Likes',
    'Total Posts',
)
workbook = xlsxwriter.Workbook('instagram_users_data.xlsx')
worksheet = workbook.add_worksheet()
for offset, title in enumerate(summary_headers):
    worksheet.write(row, col + offset, title)

# Second workbook: ten-column sheet that additionally carries the phone
# number, the following count and the profile URL.
detail_headers = (
    'Name',
    'Handle',
    'Email',
    'Phone Number',
    'Category',
    'No Of Followers',
    'No Of Following',
    'Average Likes',
    'Total Posts',
    'Profile Url',
)
workbook2 = xlsxwriter.Workbook('instagram_all_users_data.xlsx')
worksheet2 = workbook2.add_worksheet()
for offset, title in enumerate(detail_headers):
    worksheet2.write(row1, col + offset, title)

# The header row of each sheet is filled; data starts on the row after it.
row += 1
row1 += 1


# Walk the Instagram profile directory pages ("<i>-<j>" for i in 0..99 and
# j in 0..9), open every profile listed there, and append one row to both
# worksheets for each business account that passes the follower/post filter.

SHARED_DATA_PREFIX = 'window._sharedData = '
REQUEST_TIMEOUT = 9999  # seconds; value kept from the original script


def _load_shared_data(url):
    """Fetch *url* and return its decoded ``window._sharedData`` object.

    Raises:
        LookupError: the page has no ``window._sharedData`` script.
        ValueError: the script was found but its payload is not valid JSON.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "html.parser")
    for tag in soup.findAll('script'):
        payload = tag.text.strip()
        if payload.startswith(SHARED_DATA_PREFIX):
            # Drop only the assignment prefix and the statement's trailing
            # semicolon; semicolons inside JSON string values must survive.
            payload = payload[len(SHARED_DATA_PREFIX):].rstrip().rstrip(';')
            return json.loads(payload)
    raise LookupError('no window._sharedData script found at ' + url)


try:
    for i in range(100):
        for j in range(10):
            print(j)
            list_link = 'https://www.instagram.com/directory/profiles/' + str(i) + '-' + str(j) + '/'
            try:
                directory = _load_shared_data(list_link)
                profile_list = directory['entry_data']['ProfilesDirectoryPage'][0]['profile_data']['profile_list']
                # profile_list is itself a JSON-encoded string, hence the
                # second decode.
                all_data = json.loads(profile_list)
            except Exception as exc:
                # Directory page unavailable or not in the expected shape:
                # back off, then move on to the next directory page.
                print('directory page failed:', list_link, repr(exc))
                time.sleep(400)
                continue

            for data in all_data:
                page_link = 'https://www.instagram.com/' + str(data) + '/'
                print(page_link)

                try:
                    json_data = _load_shared_data(page_link)
                except ValueError as exc:
                    # Malformed JSON on this one profile: skip it, no back-off.
                    print('profile skipped:', page_link, repr(exc))
                    continue
                except Exception as exc:
                    # Network failure or unexpected page layout: back off
                    # before trying the next profile.
                    print('profile fetch failed:', page_link, repr(exc))
                    time.sleep(200)
                    continue

                try:
                    user = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
                    no_of_followers = user['edge_followed_by']['count']
                    is_business_account = user['is_business_account']
                    timeline = user['edge_owner_to_timeline_media']
                    # Must be read before the filter below uses it.
                    total_posts = timeline['count']

                    if not (is_business_account
                            and 250 < no_of_followers < 20000
                            and total_posts > 15):
                        continue

                    name = user['full_name']
                    handle = user['username']
                    email = user['business_email']
                    phone_number = user['business_phone_number']
                    category = user['business_category_name']
                    no_of_following = user['edge_follow']['count']
                    # NOTE(review): this is the like count of the first post
                    # in the returned timeline, not a mean over posts, even
                    # though it lands in the 'Average Likes' column.
                    average_likes_per_posts = timeline['edges'][0]['node']['edge_liked_by']['count']

                    # The writes stay inside the try so an unsupported cell
                    # value skips this profile instead of ending the crawl.
                    worksheet.write(row, col, name)
                    worksheet.write(row, col + 1, handle)
                    worksheet.write(row, col + 2, email)
                    worksheet.write(row, col + 3, category)
                    worksheet.write(row, col + 4, no_of_followers)
                    worksheet.write(row, col + 5, average_likes_per_posts)
                    worksheet.write(row, col + 6, total_posts)
                    row += 1

                    worksheet2.write(row1, col, name)
                    worksheet2.write(row1, col + 1, handle)
                    worksheet2.write(row1, col + 2, email)
                    worksheet2.write(row1, col + 3, phone_number)
                    worksheet2.write(row1, col + 4, category)
                    worksheet2.write(row1, col + 5, no_of_followers)
                    worksheet2.write(row1, col + 6, no_of_following)
                    worksheet2.write(row1, col + 7, average_likes_per_posts)
                    worksheet2.write(row1, col + 8, total_posts)
                    worksheet2.write(row1, col + 9, page_link)
                    row1 += 1
                except (KeyError, IndexError, TypeError) as exc:
                    # Profile JSON lacks an expected field (or holds a value
                    # that cannot be written): skip this profile only.
                    print('profile skipped:', page_link, repr(exc))
finally:
    # Both files are only written to disk on close(); do it even when the
    # crawl is interrupted so collected rows are not lost.
    workbook.close()
    workbook2.close()

No comments:

Post a Comment