12 November 2022 EU SPb meetup

Саша Вильховенко, “Web scraping”

Материалы митапа

Презентация

Код с митапа

library(rvest)
library(purrr)
library(xml2)
library(stringr)
library(dplyr)

# Start page of the scrape (presumably the hero index of fandom 46 — the
# meaning is inferred from the URL only).
url <- "https://fanfics.me/fandom46/heroes"

# Download and parse the start page once; the links below are read from it.
page <- read_html(url)

# Collect the href of every <a> matched by the selector below and prefix it
# with the site root, giving one absolute URL per link. The prefixing implies
# the hrefs are site-relative paths ("/...") — TODO confirm against the page.
group_hero <- page %>%
  html_nodes("#data-container .ContentTable a") %>%
  html_attr("href") %>%
  paste0("https://fanfics.me", .)


# Scrape one page and return its heading, name and bio text as a table.
#
# Arguments:
#   group_hero  URL (or anything read_html() accepts) of the page to scrape.
#
# Returns:
#   A tibble with columns `name`, `bio` and `group_character`. tibble() will
#   raise an error if the `name` and `bio` selectors match a different number
#   of nodes (and neither is of length one).
get_bio <- function(group_hero) {
  doc <- read_html(group_hero)

  # Text of the <h1> node(s); stored alongside every row as the group label.
  group_character <- doc %>% html_nodes("h1") %>% html_text()
  # Text of the `.text` nodes that directly follow a `.light` node or an <a>.
  bio <- doc %>% html_nodes(".light+ .text , a+ .text") %>% html_text()
  # Text of the links nested inside `.text` nodes.
  name <- doc %>% html_nodes(".text a") %>% html_text()

  # Return the tibble as the last expression. Ending the function with an
  # assignment (`data <- tibble(...)`) returned it invisibly, so calling
  # get_bio() interactively printed nothing.
  tibble(name, bio, group_character)
}

# Log the URL being fetched, then scrape that page into a tibble.
#
# Arguments:
#   link  URL of the page to download and parse.
#
# Returns:
#   A tibble with columns `name`, `bio` and `group_character`, built from the
#   text of the nodes matched by the three CSS selectors below.
parse_chapter_page <- function(link) {
  message("Parsing URL: ", link)
  doc <- read_html(link)

  # Text of every node on this page that matches a CSS selector.
  text_of <- function(css) {
    html_text(html_nodes(doc, css))
  }

  tibble(
    name = text_of(".text a"),
    bio = text_of(".light+ .text , a+ .text"),
    group_character = text_of("h1")
  )
}

# Scrape every collected URL in turn; the result holds one tibble per URL.
# NOTE(review): `list` and `data` shadow base R names. They are left unchanged
# so that any code referring to these globals keeps working.
list <- group_hero %>% lapply(parse_chapter_page)

# Stack the per-URL tibbles row-wise into a single table.
data <- list %>% bind_rows()

# Scrape pages 1-4 of the paginated group101 listing.
#
# This replaces a `for` loop whose body duplicated parse_chapter_page() but
# threw away the tibble built on every iteration, so nothing was collected.
# The loop also used `page` as its counter, overwriting the parsed start-page
# document, and leaked `link`, `bio`, `name`, ... into the global environment.
group101_urls <- paste0(
  "https://fanfics.me/fandom46/heroes/group101?page=",
  seq_len(4)
)

# parse_chapter_page() logs each URL and returns one tibble per page.
group101_list <- lapply(group101_urls, parse_chapter_page)

# All four pages stacked into one table.
group101_data <- bind_rows(group101_list)

# `list` and `data` (one tibble per URL in `group_hero`, and their row-bound
# table) are already built earlier in this script. They used to be rebuilt
# here by downloading every group page a second time, which produced the same
# result at twice the network cost, so that repeat scrape has been dropped.
# NOTE(review): if `data` was meant to include the paginated rows as well,
# combine explicitly with bind_rows(data, group101_data) and de-duplicate.