library(foreach)
library(iterators)
library(ggplot2)
library(dplyr)
library(extraDistr) # rlomax() / qlomax(), used in the Bayesian section

# Burst detection in a timestamped message log.
#
# Inter-arrival times are modelled as exponential (first with a point
# estimate of the rate, then with a gamma prior on the rate), and messages
# that arrive unusually close to a neighbour (below the 1% quantile of the
# inter-arrival distribution) are selected and printed.

# Elapsed time from `earlier` to `later`, always in seconds. Subtracting
# POSIXct values with `-` lets R choose the unit (secs/mins/hours/days), so
# the unit is pinned here to keep it consistent with a per-second rate.
gap_secs <- function(later, earlier) {
  as.numeric(difftime(later, earlier, units = "secs"))
}

# Rows of `msgs` whose gap to the previous or to the next message is
# shorter than `threshold` seconds. Expects a `pub_time` column.
select_bursts <- function(msgs, threshold) {
  msgs %>%
    filter(
      gap_secs(pub_time, lag(pub_time)) < threshold |
        gap_secs(lead(pub_time), pub_time) < threshold
    )
}

#------
# NOTE(review): read.csv2() defaults to dec = ","; with sep = "," any column
# holding decimals written with "." is read as text. Confirm this is intended.
# messages <- read.csv2("chat/bus_time_data-1659191816218.csv", sep = ",")
# messages <- read.csv2("chat/zeta_molly_data-1659195953292.csv", sep = ",")
# messages <- messages %>%
#   mutate(pub_time = as.POSIXct(strptime(pub_time, "%Y-%m-%d %T")))
messages <- read.csv2(
  "transcript/minecraft_data-1659808309634.csv",
  sep = ","
)
messages <- messages %>%
  mutate(
    start_time = as.POSIXct(strptime(start_time, "%Y-%m-%d %T")),
    end_time = as.POSIXct(strptime(end_time, "%Y-%m-%d %T"))
  )
# NOTE(review): everything below reads `pub_time`, `nick` and `message`,
# which are only prepared by the commented-out chat loaders above; the
# transcript file is parsed for `start_time` / `end_time` instead. Confirm
# which data source this analysis is meant to run on.

n_m <- nrow(messages)

#--------
# Messages per hour
ggplot(messages, aes(x = pub_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 3600)

#--------
# Total length of messages, in seconds (first column is taken to be the
# message timestamp, as in the overlay plot below)
len <- gap_secs(messages[n_m, 1], messages[1, 1])

# Poisson lambda for messages per second
lambda <- n_m / len

# Simulated arrival times under the fitted exponential model
ggplot() +
  geom_histogram(
    mapping = aes(x = cumsum(rexp(n_m, lambda))),
    binwidth = 3600
  )

# Plot with overlaid simulation from the fitted model
ggplot(messages, aes(x = pub_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 600) +
  geom_histogram(
    mapping = aes(x = cumsum(rexp(n_m, lambda)) + messages[1, 1]),
    binwidth = 600,
    fill = "red",
    alpha = 0.3
  )

#-------
# Diffs and diff probabilities
diffps <- messages %>%
  transmute(
    diff = gap_secs(pub_time, lag(pub_time)),
    p = pexp(diff, lambda)
  )

ggplot(diffps, aes(x = diff)) +
  theme_minimal() +
  stat_ecdf()

# Diff with 0.01 quantile
qdiff <- qexp(0.01, lambda)
fmess <- select_bursts(messages, qdiff)

#-------
# Print messages, with a separator wherever the next selected message is
# more than `qdiff` seconds away
n_f <- nrow(fmess)
for (i in seq_len(n_f)) {
  line <- fmess[i, ]
  cat(sprintf("[%s] <%s> %s\n", line$pub_time, line$nick, line$message))
  # The last row has no successor; comparing against the missing row would
  # yield NA and abort the `if`.
  if (i < n_f) {
    if (gap_secs(fmess[i + 1, "pub_time"], line$pub_time) > qdiff) {
      cat("----------------------\n")
    }
  }
}

# Show where messages were selected
ggplot(messages, aes(x = pub_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 600) +
  geom_histogram(data = fmess, binwidth = 600, fill = "green", alpha = 0.7)

#-------------
# Bayesian
# p(d < 3/60) = 0.01
# d ~ exp(l)
# l ~ gamma(a_p, b_p)
# l = argmax_l dgamma(a_p, 60)
a_p <- 13
b_p <- 60

# Total length of messages
# len <- as.numeric(difftime(messages[n_m,]$pub_time,
#                            messages[1,]$pub_time, units = "secs"))
# NOTE(review): hard-coded value, presumably the span in seconds of one
# particular log; it overrides the `len` computed earlier.
len <- 749503.076

# Inter-arrival times drawn from a Lomax distribution with
# lambda = 1 / (b_p + len) and kappa = a_p + n_m
gen_diffs <- rlomax(
  n_m,
  1 / (b_p + len),
  (a_p + n_m)
)

ggplot() +
  geom_histogram(mapping = aes(x = cumsum(gen_diffs)), binwidth = 3600)

# Plot with overlaid posterior: uses the Lomax draws generated above rather
# than fresh rexp() draws from the point-estimate model of the first section
ggplot(messages, aes(x = pub_time)) +
  theme_minimal() +
  geom_histogram(binwidth = 600) +
  geom_histogram(
    mapping = aes(x = cumsum(gen_diffs) + messages[1, 1]),
    binwidth = 600,
    fill = "red",
    alpha = 0.3
  )

#---------------
# Diff with 0.01 quantile
qdiff <- qlomax(0.01, 1 / (b_p + len), (a_p + n_m))
fmess <- select_bursts(messages, qdiff)