# Exploratory analysis of a timestamped message transcript.
#
# Reads the transcript CSV, parses message start/end times, fits a
# gamma-exponential model to the gaps between messages, plots the timing
# distributions, prints the transcript with separators at long pauses, and
# finally merges messages that directly continue one another.
#
# All model parameters and derived quantities are defined before the plots that
# use them, so the script can be sourced top to bottom.

library(foreach)
library(iterators)
library(ggplot2)
library(dplyr)
library(extraDistr) # rlomax() / qlomax()

# Load data ----

# read.csv2() with `sep` overridden: fields are comma-separated while the
# decimal mark stays at read.csv2()'s default.
messages <- read.csv2(
  "transcript/minecraft_data-1659808309634.csv",
  sep = ","
)
messages <- messages %>%
  mutate(
    start_time = as.POSIXct(strptime(start_time, "%F %T")),
    end_time = as.POSIXct(strptime(end_time, "%F %T"))
  )
n_m <- nrow(messages)

# Bayesian model of the gaps between messages ----

# p(d < 3/60) = 0.01
# d ~ exp(l)
# l ~ gamma(a_p, b_p)
# l = argmax_l dgamma(a_p, 60)
a_p <- 13
b_p <- 60

# Total length of messages, in seconds. Hard-coded; it was once derived as:
# len <- as.numeric(difftime(messages[n_m, ]$pub_time, messages[1, ]$pub_time,
#                            units = "secs"))
len <- 697666.923

# Poisson lambda for messages per second.
# NOTE(review): `lambda` was used by the overlay plot but never assigned in
# this script. The posterior mean of gamma(a_p + n_m, b_p + len) is assumed
# here -- confirm this is the intended value.
lambda <- (a_p + n_m) / (b_p + len)

# Gap threshold: the 0.02 quantile of the Lomax distribution used for the gaps.
# NOTE(review): the original comment said "0.01 quantile" -- confirm which
# probability is intended.
qdiff <- qlomax(0.02, 1 / (b_p + len), a_p + n_m)

# Messages that are closer than `qdiff` seconds to their predecessor or to
# their successor.
fmess <- messages %>%
  filter(
    as.numeric(difftime(start_time, lag(start_time), units = "secs")) < qdiff |
      as.numeric(difftime(lead(start_time), start_time, units = "secs")) < qdiff
  )

# Gap to the previous message, in numeric seconds so that numeric overlays
# (qdiff, simulated Lomax draws) share the same continuous scale.
message_gaps <- messages %>%
  mutate(
    tdiff = as.numeric(difftime(start_time, lag(start_time), units = "secs"))
  )

# Timing plots ----

# Messages per hour.
ggplot(messages, aes(x = start_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 3600)

# Empirical CDF of the gaps, with simulated Lomax gaps overlaid in red.
ggplot(message_gaps, aes(x = tdiff)) +
  theme_minimal() +
  stat_ecdf() +
  stat_ecdf(
    data = data.frame(
      tdiff = rlomax(n_m * 10, 1 / (b_p + len), a_p + n_m)
    ),
    color = "red"
  )

# Histogram of the gaps with the threshold marked.
ggplot(message_gaps, aes(x = tdiff)) +
  theme_minimal() +
  geom_histogram(binwidth = 10) +
  geom_vline(xintercept = qdiff)

# Messages per hour with one vertical line per message.
ggplot(messages, aes(x = start_time)) +
  theme_minimal() +
  geom_vline(aes(xintercept = start_time)) +
  geom_histogram(binwidth = 3600)

# Gap to the previous message over time, with the threshold marked.
ggplot(message_gaps, aes(x = start_time, y = tdiff)) +
  theme_minimal() +
  geom_point() +
  geom_hline(yintercept = qdiff)

# Print messages ----

# Gap from each message to the next one; NA for the last message, which has no
# successor and therefore never gets a separator.
gap_to_next <- as.numeric(
  difftime(lead(messages$start_time), messages$start_time, units = "secs")
)
for (i in seq_len(nrow(messages))) {
  line <- messages[i, ]
  cat(sprintf(
    "[%s] <%s> %s\n",
    line$start_time, line$names, line$transcription_line
  ))
  # isTRUE() treats a missing gap as "no separator" instead of aborting.
  if (isTRUE(gap_to_next[i] > qdiff + 20)) {
    cat("----------------------\n")
  }
}

# Show where messages were selected ----

# NOTE(review): `pub_time` is not created or parsed anywhere in this script;
# presumably it is a column of the CSV -- confirm it exists and is a time.
ggplot(messages, aes(x = pub_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 600) +
  geom_histogram(data = fmess, binwidth = 600, fill = "green", alpha = 0.7)

# Simulated arrivals ----

# Arrival times obtained by accumulating simulated Lomax gaps.
gen_diffs <- rlomax(n_m, 1 / (b_p + len), a_p + n_m)
ggplot() +
  geom_histogram(mapping = aes(x = cumsum(gen_diffs)), binwidth = 3600)

# Plot with overlaid posterior: observed start times against arrivals simulated
# with exponential gaps of rate `lambda`, starting at the first message.
simulated_starts <- data.frame(
  start_time = messages$start_time[1] + cumsum(rexp(n_m, lambda))
)
ggplot(messages, aes(x = start_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 600) +
  geom_histogram(
    data = simulated_starts,
    binwidth = 600,
    fill = "red",
    alpha = 0.3
  )

# Message durations ----

ggplot(
  messages %>%
    mutate(len = as.double(difftime(end_time, start_time, units = "secs"))),
  aes(x = len)
) +
  geom_histogram(binwidth = 0.5)

# Time spans of the first 50 messages; head() avoids all-NA padding rows when
# fewer than 50 messages are available.
ggplot(
  head(messages, 50),
  aes(xmin = start_time, xmax = end_time, ymin = -1, ymax = 1)
) +
  theme_minimal() +
  geom_rect()

# Merge touching messages ----

# A message that starts exactly when the current run ends is appended to the
# run's first message: its text is pasted on, the run's end time is extended,
# and the absorbed row is marked NA for removal below.
run_start <- 1
for (i in seq_len(nrow(messages))[-1]) {
  touches <- messages$start_time[i] == messages$end_time[run_start]
  # isTRUE(): an unparseable (NA) timestamp counts as "does not touch".
  if (isTRUE(touches)) {
    messages$transcription_line[run_start] <- paste(
      messages$transcription_line[run_start],
      messages$transcription_line[i]
    )
    messages$end_time[run_start] <- messages$end_time[i]
    messages$transcription_line[i] <- NA
  } else {
    run_start <- i
  }
}
messages <- messages %>%
  filter(!is.na(transcription_line))

# Sanity check: after merging, which messages still start exactly when the
# previous one ends.
messages %>%
  transmute(touch = start_time == lag(end_time))