# Sasha Vilkhovenko, "Web scraping"
# Code from the meetup
library(rvest)
library(purrr)
library(xml2)
library(stringr)
library(dplyr)
# Landing page that lists the hero groups for fandom 46.
url <- "https://fanfics.me/fandom46/heroes"
page <- read_html(url)
# Collect the href of every character link in the content table and
# turn the relative paths into absolute URLs (paste0 instead of
# paste(..., sep = "")).
group_hero <- page %>%
  html_nodes("#data-container .ContentTable a") %>%
  html_attr("href") %>%
  paste0("https://fanfics.me", .)
# Scrape a single character-group page into a tibble.
#
# @param group_hero URL of one character page on fanfics.me.
# @return A tibble with columns `name`, `bio`, `group_character`.
#   NOTE(review): tibble() requires the three scraped vectors to have
#   compatible lengths (equal or length 1); pages where the CSS
#   selectors match different counts will error — confirm on real data.
get_bio <- function(group_hero) {
  link <- read_html(group_hero)
  # Page heading = the group/character the page belongs to.
  group_character <- link %>% html_nodes("h1") %>% html_text()
  bio <- link %>% html_nodes(".light+ .text , a+ .text") %>% html_text()
  name <- link %>%
    html_nodes(".text a") %>%
    html_text()
  # Return the tibble as the function value. The original assigned it
  # to a local (`data <- tibble(...)`), which made the return value
  # invisible to callers.
  tibble(name, bio, group_character)
}
# Download one character page and return the scraped fields as a
# tibble with columns `name`, `bio`, `group_character`.
parse_chapter_page <- function(link) {
  message('Parsing URL: ', link)
  doc <- read_html(link)
  heading <- doc %>%
    html_nodes("h1") %>%
    html_text()
  bios <- doc %>%
    html_nodes(".light+ .text , a+ .text") %>%
    html_text()
  character_names <- doc %>%
    html_nodes(".text a") %>%
    html_text()
  # Same column names as before; only the local variable names differ.
  tibble(name = character_names, bio = bios, group_character = heading)
}
# Scrape every character page and stack the per-page tibbles.
# Renamed `list` -> `pages`: the original name masked base::list().
pages <- lapply(group_hero, FUN = parse_chapter_page)
data <- bind_rows(pages)
# The "group101" listing is paginated (pages 1-4). The original loop
# re-implemented parse_chapter_page's body inline, built a tibble on
# each iteration and then discarded it (the last expression of a `for`
# body is not kept), so four network requests produced nothing. It also
# reused `page` as the loop variable, clobbering the document read
# earlier. Collect the results instead and bind them together.
group101_pages <- lapply(seq_len(4), function(page_no) {
  page_url <- paste0("https://fanfics.me/fandom46/heroes/group101?page=", page_no)
  parse_chapter_page(page_url)
})
group101_data <- bind_rows(group101_pages)
# (Removed) These two lines were an exact duplicate of the scrape
# above: they re-downloaded every URL in `group_hero` a second time and
# rebuilt the same `data` tibble, doubling the load on the server for
# no benefit.