Hago web scraping de una URL y obtengo una lista anidada. En el siguiente paso quiero convertir esa lista en un data frame, pero mi solución no funciona. Creo que el problema es que los elementos de la lista tienen nombres y no consigo eliminarlos antes de la conversión. Espero que tengas alguna pista.

require(tidyverse)
require(rvest)

# Mock of the scraped result (the real scraping code is at the end): a list of
# three two-row tables. In every column the first entry is a label and the
# second entry is the matching value, so each column is built from strings.
# The repeated column names (course1, course1, ...) are made unique by
# data.frame() itself (course1, course1.1, ...).
item1 <- data.frame(
  id_course1 = rep("id_course1", 2),
  course1 = c("participants", "15"),
  course1 = c("mark1", "1"),
  course1 = c("mark2", "2"),
  course1 = c("mark3", "3"),
  course1 = c("mark4", "4"),
  course1 = c("mark5", "5")
)
item2 <- data.frame(
  id_course2 = rep("id_course2", 2),
  course2 = c("participants", "30"),
  course2 = c("mark1", "10"),
  course2 = c("mark2", "8"),
  course2 = c("mark3", "6"),
  course2 = c("mark4", "4"),
  course2 = c("mark5", "2")
)
item3 <- data.frame(
  id_course3 = rep("id_course3", 2),
  course3 = c("participants", "15"),
  course3 = c("mark1", "2"),
  course3 = c("mark2", "4"),
  course3 = c("mark3", "5"),
  course3 = c("mark4", "3"),
  course3 = c("mark5", "1")
)
# Unnamed list holding the three tables in course order
my.list <- list(item1, item2, item3)

# Row-binding attempts: each call stacks the three tables into one data frame,
# but none of them yields the desired one-row-per-course layout.
require(data.table)
data.table::rbindlist(l = my.list, fill = TRUE)
my.list %>% dplyr::bind_rows()
my.list %>% unname() %>% dplyr::bind_rows()

# Attempts that keep only element 2 of each table and then combine the pieces;
# none of them gives the desired result.
# NOTE: `[[` with index 2 on a data frame extracts its second *column*
# (label + value of "participants"), not its second row.
do.call(cbind, lapply(my.list, function(tbl) tbl[[2]]))
do.call(rbind, lapply(my.list, function(tbl) tbl[[2]]))
dplyr::bind_rows(lapply(my.list, function(tbl) tbl[[2]]))

# Target layout: one row per course, one column per field. The values are
# written as strings because each row mixes the course id with the numbers.
df_what_i_want <- data.frame(rbind(
  c("id_course1", "15", "1", "2", "3", "4", "5"),
  c("id_course2", "30", "10", "8", "6", "4", "2"),
  c("id_course3", "15", "2", "4", "5", "3", "1")
))
# Replace the auto-generated column names with the field labels
names(df_what_i_want) <- c(
  "id_course1", "participants",
  "mark1", "mark2", "mark3", "mark4", "mark5"
)


# Scrape the website: read the page, take every <table> located inside an
# <li> element, and parse each of those tables into a data frame.
url <- "https://www.fernuni-hagen.de/wirtschaftswissenschaft/studium/klausurstatistik.shtml"
courses_list <- html_table(
  html_nodes(html_nodes(read_html(url), "li"), "table"),
  fill = TRUE
)
2
Alexander 26 mar. 2021 a las 12:07

2 respuestas

La mejor respuesta

Esto también serviría

library(janitor)
library(tidyverse)
# For each table: promote row 1 to the header (which also drops that row),
# then overwrite the header with the first row of the FIRST table so that all
# pieces share the same column names and are row-bound into one data frame.
map_dfr(my.list, function(tbl) {
  promoted <- janitor::row_to_names(as.data.frame(tbl), 1)
  setNames(promoted, my.list[[1]][1, ])
})

  id_course1 participants mark1 mark2 mark3 mark4 mark5
1 id_course1           15     1     2     3     4     5
2 id_course2           30    10     8     6     4     2
3 id_course3           15     2     4     5     3     1
1
AnilGoyal 26 mar. 2021 a las 10:05

Creo que sería mejor corregir esto directamente durante el raspado (scraping). Prueba esto:

library(rvest)

# Page to scrape
url <- "https://www.fernuni-hagen.de/wirtschaftswissenschaft/studium/klausurstatistik.shtml"

# Parse the tables found inside <li> elements; for each one, use its first row
# as the column names, drop that row, and row-bind all tables into `result`.
# `slice()` is namespace-qualified because this snippet only attaches rvest,
# so an unqualified call would fail in a fresh session.
result <- read_html(url) %>%
  html_nodes("li") %>%
  html_nodes("table") %>%
  head() %>% # remove this later to process all tables, not only the first six
  html_table(fill = TRUE) %>%
  purrr::map_df(~ .x %>% setNames(.[1, ]) %>% dplyr::slice(-1))

result
1
Ronak Shah 26 mar. 2021 a las 09:21