编辑:2018 年 12 月更新:
自发布此消息以来,Insta 领域的情况发生了变化。这是一个更新后的脚本,它更加 Python 并且更好地利用了 XPATH/CSS 路径。
请注意,要使用此更新后的脚本,您必须安装 explicit 包 (pip install explicit),或将带有 waiter 的每一行转换为纯硒显式等待。
import itertools
from explicit import waiter, XPATH
from selenium import webdriver
def login(driver):
username = "" # <username here>
password = "" # <password here>
# Load page
driver.get("https://www.instagram.com/accounts/login/")
# Login
waiter.find_write(driver, "//div/input[@name='username']", username, by=XPATH)
waiter.find_write(driver, "//div/input[@name='password']", password, by=XPATH)
waiter.find_element(driver, "//div/button[@type='submit']", by=XPATH).click()
# Wait for the user dashboard page to load
waiter.find_element(driver, "//a/span[@aria-label='Find People']", by=XPATH)
def scrape_followers(driver, account):
# Load account page
driver.get("https://www.instagram.com/{0}/".format(account))
# Click the 'Follower(s)' link
# driver.find_element_by_partial_link_text("follower").click()
waiter.find_element(driver, "//a[@href='/instagram/followers/']", by=XPATH).click()
# Wait for the followers modal to load
waiter.find_element(driver, "//div[@role='dialog']", by=XPATH)
# At this point a Followers modal pops open. If you immediately scroll to the bottom,
# you hit a stopping point and a "See All Suggestions" link. If you fiddle with the
# model by scrolling up and down, you can force it to load additional followers for
# that person.
# Now the modal will begin loading followers every time you scroll to the bottom.
# Keep scrolling in a loop until you've hit the desired number of followers.
# In this instance, I'm using a generator to return followers one-by-one
follower_css = "ul div li:nth-child({}) a.notranslate" # Taking advange of CSS's nth-child functionality
for group in itertools.count(start=1, step=12):
for follower_index in range(group, group + 12):
yield waiter.find_element(driver, follower_css.format(follower_index)).text
# Instagram loads followers 12 at a time. Find the last follower element
# and scroll it into view, forcing instagram to load another 12
# Even though we just found this elem in the previous for loop, there can
# potentially be large amount of time between that call and this one,
# and the element might have gone stale. Lets just re-acquire it to avoid
# that
last_follower = waiter.find_element(driver, follower_css.format(follower_index))
driver.execute_script("arguments[0].scrollIntoView();", last_follower)
if __name__ == "__main__":
account = 'instagram'
driver = webdriver.Chrome()
try:
login(driver)
# Print the first 75 followers for the "instagram" account
print('Followers of the "{}" account'.format(account))
for count, follower in enumerate(scrape_followers(driver, account=account), 1):
print("\t{:>3}: {}".format(count, follower))
if count >= 75:
break
finally:
driver.quit()
我做了一个快速的基准测试来显示性能如何随着您尝试以这种方式抓取的关注者越多而呈指数级下降:
$ python example.py
Followers of the "instagram" account
Found 100 followers in 11 seconds
Found 200 followers in 19 seconds
Found 300 followers in 29 seconds
Found 400 followers in 47 seconds
Found 500 followers in 71 seconds
Found 600 followers in 106 seconds
Found 700 followers in 157 seconds
Found 800 followers in 213 seconds
Found 900 followers in 284 seconds
Found 1000 followers in 375 seconds
原帖:
你的问题有点令人困惑。例如,我不太确定“我可以从中抓取并通过所有迭代进行分页”实际上意味着什么。您目前使用什么来抓取和分页?
无论如何,instagram.com/instagram/media/ 与instagram.com/instagram/followers 不是同一类型的端点。 media 端点似乎是一个 REST API,配置为返回一个易于解析的 JSON 对象。
据我所知,followers 端点并不是真正的 RESTful 端点。相反,Instagram AJAX 在您单击“关注者”按钮后将信息发送到页面源(使用 React?)。我认为如果不使用 Selenium 之类的东西,您将无法获得这些信息,Selenium 可以加载/呈现向用户显示关注者的 javascript。
此示例代码将起作用:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def login(driver):
username = "" # <username here>
password = "" # <password here>
# Load page
driver.get("https://www.instagram.com/accounts/login/")
# Login
driver.find_element_by_xpath("//div/input[@name='username']").send_keys(username)
driver.find_element_by_xpath("//div/input[@name='password']").send_keys(password)
driver.find_element_by_xpath("//span/button").click()
# Wait for the login page to load
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.LINK_TEXT, "See All")))
def scrape_followers(driver, account):
# Load account page
driver.get("https://www.instagram.com/{0}/".format(account))
# Click the 'Follower(s)' link
driver.find_element_by_partial_link_text("follower").click()
# Wait for the followers modal to load
xpath = "//div[@style='position: relative; z-index: 1;']/div/div[2]/div/div[1]"
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, xpath)))
# You'll need to figure out some scrolling magic here. Something that can
# scroll to the bottom of the followers modal, and know when its reached
# the bottom. This is pretty impractical for people with a lot of followers
# Finally, scrape the followers
xpath = "//div[@style='position: relative; z-index: 1;']//ul/li/div/div/div/div/a"
followers_elems = driver.find_elements_by_xpath(xpath)
return [e.text for e in followers_elems]
if __name__ == "__main__":
driver = webdriver.Chrome()
try:
login(driver)
followers = scrape_followers(driver, "instagram")
print(followers)
finally:
driver.quit()
由于多种原因,这种方法存在问题,其中最主要的原因是它相对于 API 有多慢。