Necesito procesar un gran conjunto de datos de millones de entradas, con el siguiente formato:
Tabla: Visitas
|----------------|--------------|------------|
| PERSON_ID | DATE | #Clicks |
|----------------|--------------|------------|
| 1 | 2017-05-04 | 4 |
| 1 | 2018-05-04 | 1 |
| 1 | 2016-02-04 | 5 |
| 1 | 2018-05-06 | 7 |
| 2 | 2018-05-04 | 8 |
| 2 | 2018-05-16 | 1 |
| 2 | 2018-01-04 | 1 |
| 2 | 2018-02-04 | 2 |
| ... | ... | ... |
|----------------|--------------|------------|
Quiero contar los clics de cada día + 30 días siguientes.
Datos N = 2,000,000 Personas = 15,000
Iterar sobre cada persona toma alrededor de 1 segundo, esto es disminuir la velocidad. Cualquier consejo sobre cómo ajustar el código sería apreciado.
Ya intenté usar apply / lapply sin gran éxito.
Ejemplo de código: biblioteca (lubridate);
#Initial Data Set
visits <- data.frame(person_id=c(1,1,1,1,2,2,2,2),
date=c(ymd("2017-05-04"),ymd("2018-05-04"),ymd("2016-02-04"),ymd("2018-05-06"),ymd("2018-05-04"),ymd("2018-05-16"),ymd("2018-01-04"),ymd("2018-02-04")),
clicks=c(4,1,5,7,8,1,1,2),
clicks_30days=0)
unique_visitors <- unique(visits$person_id)
#For Each Person
for(person_id in unique_visitors)
{
#Subset person's records and order the, descending
person_visits <- visits[visits$person_id == person_id,]
person_visits <- person_visits[order(person_visits$date),]
#For each visit count the # of clicks of the visit + all visits within visit's date + 30 days
for(i in 1:nrow(person_visits))
{
search_interval <- interval( person_visits$date[i] , person_visits$date[i]+days(30))
#####This is the interesting codeline#####
calc_result <- sum(person_visits$clicks[person_visits$date %within% search_interval])**
##########################################
#save the clicks + 30 days
visits[rownames(person_visits)[i],"clicks_30days"] <- calc_result
}
}
Algo más rápido que esto es realmente muy apreciado.
4 respuestas
Un enfoque data.table
usando uniones no equi:
library(data.table)
setDT(visits)[, clicks_30days :=
visits[.(person_id=person_id, start=date, end=date+30L),
on=.(person_id, date>=start, date<=end), sum(clicks), by=.EACHI]$V1
]
Salida:
person_id date clicks clicks_30days
1: 1 2017-05-04 4 4
2: 1 2018-05-04 1 8
3: 1 2016-02-04 5 5
4: 1 2018-05-06 7 7
5: 2 2018-05-04 8 9
6: 2 2018-05-16 1 1
7: 2 2018-01-04 1 1
8: 2 2018-02-04 2 2
Código de tiempo:
library(data.table)
set.seed(0L)
npers <- 15e3L
ndates <- 150L
visits <- data.frame(person_id=rep(1L:npers, each=ndates),
date=sample(seq(Sys.Date()-5L*365L, Sys.Date(), by="1 day"), npers*ndates, TRUE),
clicks=sample(10, npers*ndates, TRUE))
vi <- visits
mtd0 <- function() {
visits$person_id <- as.integer(visits$person_id) # faster for integers
unique_visitors <- unique(visits$person_id)
# create columns as vectors (accessing elements in loop will be fast)
r <- visits$clicks_30days2 <- 0 # result vector
j <- 1L
person_id <- visits$person_id
CL <- visits$clicks
DATE_as_int <- as.integer(visits$date) # convert dates to integers
for (id in unique_visitors){
x <- person_id == id # indicates current person
dates <- DATE_as_int[x] # take dates of this person
clicks <- CL[x] # clicks of this person
for (i in 1:length(dates)) {
i_date <- dates[i] # take i-th date
ii <- i_date <= dates & dates <= i_date + 30 # test interval
# r[x][i] <- sum(clicks[ii]) # sum
r[j] <- sum(clicks[ii]) # faster using one index
j <- j + 1L
}
}
visits$clicks_30days2 <- r # assigne to results
visits
}
mtd1 <- function() {
setDT(vi)[, clicks_30days :=
vi[.(person_id=person_id, start=date, end=date+30L),
on=.(person_id, date>=start, date<=end), sum(clicks), by=.EACHI]$V1
]
}
library(microbenchmark)
microbenchmark(mtd0(), mtd1(), times=3L)
Tiempos:
Unit: seconds
expr min lq mean median uq max neval cld
mtd0() 144.847468 145.339189 146.358507 145.830910 147.114026 148.397141 3 b
mtd1() 2.367768 2.398254 2.445058 2.428741 2.483703 2.538665 3 a
# creation of interval for each row can be slow
# and this is not needed here
visits$person_id <- as.integer(visits$person_id) # faster for integers
unique_visitors <- unique(visits$person_id)
# create columns as vectors (accessing elements in loop will be fast)
r <- visits$clicks_30days2 <- 0 # result vector
j <- 1L
person_id <- visits$person_id
CL <- visits$clicks
DATE_as_int <- as.integer(visits$date) # convert dates to integers
for (id in unique_visitors){
x <- person_id == id # indicates current person
dates <- DATE_as_int[x] # take dates of this person
clicks <- CL[x] # clicks of this person
for (i in 1:length(dates)) {
i_date <- dates[i] # take i-th date
ii <- i_date <= dates & dates <= i_date + 30 # test interval
# r[x][i] <- sum(clicks[ii]) # sum
r[j] <- sum(clicks[ii]) # faster using one index
j <- j + 1L
}
}
visits$clicks_30days2 <- r # assigne to results
visits
# person_id date clicks clicks_30days clicks_30days2
# 1 1 2017-05-04 4 4 4
# 2 1 2018-05-04 1 8 8
# 3 1 2016-02-04 5 5 5
# 4 1 2018-05-06 7 7 7
# 5 2 2018-05-04 8 9 9
# 6 2 2018-05-16 1 1 1
# 7 2 2018-01-04 1 1 1
# 8 2 2018-02-04 2 2 2
Algunos horarios:
# running on 280000 row data set:
visits2 <- visits2[order(visits2$person_id), ]
# data need to be sorted by person_id for my approach to yield correct results
system.time(rr <- minem(visits2)) # 4.50
system.time(rr2 <- ronak(visits2)) # 25.64
Considere una suma acumulada con vapply
:
visits$date30 <- visits$date + 30
visits$person_id <- as.integer(visits$person_id)
visits$clicks_30days <- vapply(1:nrow(visits), function(i)
with(visits, sum(clicks[(person_id == person_id[i]) &
(date >= date[i] & date <= date30[i])])),
numeric(1))
visits
# person_id date clicks clicks_30days date30
# 1 1 2017-05-04 4 4 2017-06-03
# 2 1 2018-05-04 1 8 2018-06-03
# 3 1 2016-02-04 5 5 2016-03-05
# 4 1 2018-05-06 7 7 2018-06-05
# 5 2 2018-05-04 8 9 2018-06-03
# 6 2 2018-05-16 1 1 2018-06-15
# 7 2 2018-01-04 1 1 2018-02-03
# 8 2 2018-02-04 2 2 2018-03-06
Puede reducir y simplificar el código al agruparlo por person_id
y calcular sum
de clicks
durante un período de 30 días para cada date
.
library(tidyverse)
visits %>%
group_by(person_id) %>%
mutate(clicks_30days = map_dbl(date, ~sum(clicks[date >= . &
date <= (. + 30)])))
# Groups: person_id [2]
# person_id date clicks clicks_30days
# <dbl> <date> <dbl> <dbl>
#1 1 2017-05-04 4 4
#2 1 2018-05-04 1 8
#3 1 2016-02-04 5 5
#4 1 2018-05-06 7 7
#5 2 2018-05-04 8 9
#6 2 2018-05-16 1 1
#7 2 2018-01-04 1 1
#8 2 2018-02-04 2 2
Preguntas relacionadas
Preguntas vinculadas
Nuevas preguntas
r
R es un entorno de software y lenguaje de programación de código abierto y gratuito para computación estadística, bioinformática, visualización y computación en general. Proporcione ejemplos mínimos y reproducibles junto con el resultado deseado. Use dput () para los datos y especifique todos los paquetes no base con llamadas a library (). No incruste imágenes para datos o código, use bloques de código con sangría en su lugar. Para preguntas relacionadas con estadísticas, use https://stats.stackexchange.com.