【发布时间】:2022-01-21 05:41:23
【问题描述】:
我是 R 新手,希望能得到一些帮助。我正在尝试从网站上抓取有关犬种的数据。
品种列表的链接在这里: https://dogtime.com/dog-breeds/profiles
每个品种资料的网址都以https://dogtime.com/dog-breeds/ 为基础,然后添加品种名称(例如https://dogtime.com/dog-breeds/golden-retriever)。
我已使用以下代码成功抓取了一个品种的数据,但我现在想收集网站上所有 392 个品种的数据并将结果存储在数据框中。
library(rvest)
library(dplyr)
library(purrr)
# Create a vector of URLs
dog_links <- page %>% html_nodes(".list-item-title") %>%
html_attr("href")
# Create a new variable for the website link
link = "https://dogtime.com/dog-breeds/golden-retriever"
# Get HTML code from this website
page <- read_html(link)
# Create variables for each of the attributes
breed <- page %>% html_nodes("h1") %>% html_text()
adaptability = page %>% html_nodes(".title-box+ .paws .parent-characteristic .characteristic-star-block") %>% html_text()
apartment_living = page %>% html_nodes(".title-box+ .paws .parent-characteristic+ .child-characteristic .characteristic-star-block") %>% html_text()
novice_owners = page %>% html_nodes(".title-box+ .paws .child-characteristic:nth-child(3) .characteristic-star-block") %>% html_text()
sensitivity_level = page %>% html_nodes(".title-box+ .paws .child-characteristic:nth-child(4) .characteristic-star-block") %>% html_text()
tolerates_alone = page %>% html_nodes(".title-box+ .paws .child-characteristic:nth-child(5) .characteristic-star-block") %>% html_text()
tolerates_cold = page %>% html_nodes(".title-box+ .paws .child-characteristic:nth-child(6) .characteristic-star-block") %>% html_text()
tolerates_hot = page %>% html_nodes(".title-box+ .paws .child-characteristic:nth-child(7) .characteristic-star-block") %>% html_text()
friendliness = page %>% html_nodes(".paws:nth-child(3) .parent-characteristic .characteristic-star-block") %>% html_text()
affectionate = page %>% html_nodes(".paws:nth-child(3) .parent-characteristic+ .child-characteristic .characteristic-star-block") %>% html_text()
kid_friendly = page %>% html_nodes(".paws:nth-child(3) .child-characteristic:nth-child(3) .characteristic-star-block") %>% html_text()
dog_friendly = page %>% html_nodes(".paws:nth-child(3) .child-characteristic:nth-child(4) .characteristic-star-block") %>% html_text()
stranger_friendly = page %>% html_nodes(".paws:nth-child(3) .child-characteristic:nth-child(5) .characteristic-star-block") %>% html_text()
health_grooming = page %>% html_nodes(".paws:nth-child(4) .parent-characteristic .characteristic-star-block") %>% html_text()
shedding = page %>% html_nodes(".paws:nth-child(4) .parent-characteristic+ .child-characteristic .characteristic-star-block") %>% html_text()
drooling = page %>% html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(3) .characteristic-star-block") %>% html_text()
easy_groom = page %>% html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(4) .characteristic-star-block") %>% html_text()
general_health = page %>% html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(5) .characteristic-star-block") %>% html_text
weight_gain = page %>% html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(6) .characteristic-star-block") %>% html_text()
size = page %>% html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(7) .characteristic-star-block") %>% html_text()
trainability = page %>% html_nodes("#cf_hagn+ .paws .parent-characteristic .characteristic-star-block") %>% html_text()
easy_train = page %>% html_nodes("#cf_hagn+ .paws .parent-characteristic+ .child-characteristic .characteristic-star-block") %>% html_text()
intelligence = page %>% html_nodes("#cf_hagn+ .paws .child-characteristic:nth-child(3) .characteristic-star-block") %>% html_text()
mouthiness = page %>% html_nodes("#cf_hagn+ .paws .child-characteristic:nth-child(4) .characteristic-star-block") %>% html_text()
prey_drive = page %>% html_nodes("#cf_hagn+ .paws .child-characteristic:nth-child(5) .characteristic-star-block") %>% html_text()
barking = page %>% html_nodes("#cf_hagn+ .paws .child-characteristic:nth-child(6) .characteristic-star-block") %>% html_text()
wanderlust = page %>% html_nodes("#cf_hagn+ .paws .child-characteristic:nth-child(7) .characteristic-star-block") %>% html_text()
physical_needs = page %>% html_nodes("#cf_hagn~ .paws+ .paws .parent-characteristic .characteristic-star-block") %>% html_text()
energy_level = page %>% html_nodes("#cf_hagn~ .paws+ .paws .parent-characteristic+ .child-characteristic .characteristic-star-block") %>% html_text()
intensity = page %>% html_nodes("#cf_hagn~ .paws+ .paws .child-characteristic:nth-child(3) .characteristic-star-block") %>% html_text()
exercise_needs = page %>% html_nodes("#cf_hagn~ .paws+ .paws .child-characteristic:nth-child(4) .characteristic-star-block") %>% html_text()
playfulness = page %>% html_nodes("#cf_hagn~ .paws+ .paws .child-characteristic:nth-child(5) .characteristic-star-block") %>% html_text()
breed_group = page %>% html_nodes(".vital-stat-box:nth-child(1)") %>% html_text()
height = page %>% html_nodes(".vital-stat-box:nth-child(2)") %>% html_text()
weight = page %>% html_nodes(".vital-stat-box:nth-child(3)") %>% html_text()
life_span = page %>% html_nodes(".vital-stat-box:nth-child(4)") %>% html_text()
# Create a data frame
dogs = data.frame(breed, adaptability, apartment_living, novice_owners, sensitivity_level, tolerates_alone, tolerates_cold, tolerates_hot, friendliness, affectionate, kid_friendly, dog_friendly, stranger_friendly, health_grooming, shedding, drooling, easy_groom, general_health, weight_gain, size, trainability, easy_train, intelligence, mouthiness, prey_drive, barking, wanderlust, physical_needs, energy_level, intensity, exercise_needs, playfulness, breed_group, height, weight, life_span, stringsAsFactors = FALSE)
# view data frame
View(dogs)
抱歉,代码中有很多变量要存储。我想我需要使用一个 for 循环来遍历各个品种的每个不同的 url,但我不确定我会如何写这个,因为 'i' 值是字符而不是数字。
谁能告诉我这是否是最好的方法,如果是,我将如何做到这一点?
非常感谢您的帮助,
詹姆斯
【问题讨论】:
标签: r for-loop web-scraping rvest