In a previous tutorial, we built an Instagram Scraper with Python. That method scraped meta tags from the post content, and those tags held links to the image or video for the post. However, that method no longer works. We can still get the same content (image or video URL) by appending ?__a=1 to any Instagram post URL, which returns the post's information in JSON format. For example, to get the JSON for an Instagram post such as https://www.instagram.com/p/B9X_7yVFt5q, we can use the URL https://www.instagram.com/p/B9X_7yVFt5q/?__a=1
The full source code for scraping an entire Instagram profile is below.
# import selenium, time, urllib, requests & os modules
from selenium import webdriver
import time, urllib.request, requests, os
# start a Chrome session and load the Instagram profile to be scraped
profile_url = "https://www.instagram.com/funnywhimsical/"
driver = webdriver.Chrome()
driver.get(profile_url)
# scroll to the bottom of the page, repeating until the document height is
# the same after two consecutive scrolls
scroll_script = (
    "window.scrollTo(0, document.body.scrollHeight);"
    "var lenOfPage=document.body.scrollHeight;return lenOfPage;"
)
lenOfPage = driver.execute_script(scroll_script)
while True:
    lastCount = lenOfPage
    # pause before measuring again so the page has a chance to grow
    time.sleep(3)
    lenOfPage = driver.execute_script(scroll_script)
    if lastCount == lenOfPage:
        # height did not change after this scroll: stop scrolling
        break
# find all links on the page and, if they contain '/p/', append them to the
# list named posts
posts = []
try:
    # the locator strategy is passed as a plain string ("tag name" is the
    # value of selenium's By.TAG_NAME) so no additional import is required;
    # the older find_elements_by_tag_name helper is gone in Selenium 4.3+
    links = driver.find_elements("tag name", "a")
    for link in links:
        post = link.get_attribute('href')
        # an anchor without an href yields None, which cannot be searched
        if post and '/p/' in post:
            posts.append(post)
finally:
    # the hrefs are plain strings by now and the remaining visible steps
    # only use `posts`, so release the browser and its driver process
    driver.quit()
print(posts)
# create download directory; exist_ok makes this a no-op when it is already
# there and avoids the gap between a separate existence check and the create
os.makedirs('Downloads', exist_ok=True)
# get url of image or video for every collected post and save it under
# Downloads/<shortcode>.mp4 (video) or Downloads/<shortcode>.jpg (image)
headers = {'User-Agent': 'Mozilla'}
download_url = ''
for post in posts:
    try:
        # appending ?__a=1 to a post URL asks for the post data as JSON;
        # the timeout keeps one stalled request from hanging the whole run
        r = requests.get('{}?__a=1'.format(post), headers=headers, timeout=30)
        r.raise_for_status()
        data = r.json()['graphql']['shortcode_media']
        shortcode = data['shortcode']
        is_video = data['is_video']
        if is_video:
            download_url = data['video_url']
            extension = 'mp4'
        else:
            download_url = data['display_url']
            extension = 'jpg'
        urllib.request.urlretrieve(
            download_url, 'Downloads/{}.{}'.format(shortcode, extension)
        )
    except (requests.RequestException, OSError, ValueError, KeyError) as error:
        # a failed request, a non-JSON body, a missing key or a failed
        # download affects only this post: report it and move on
        print('Skipping {}: {!r}'.format(post, error))
        continue
    print(download_url)