transcript study

3 years ago · 60012b90e2
parent 2b2806457e
commit 60012b90e2
4 changed files with 344 additions and 3 deletions
--- a/data/chat_freq_study.R
+++ b/data/chat_freq_study.R
@ -6,8 +6,9 @@ library(dplyr)
 #------

 # messages <- read.csv2("chat/bus_time_data-1659191816218.csv", sep = ",")
-messages <- read.csv2("chat/zeta_molly_data-1659195953292.csv", sep = ",")
-messages <- messages %>% mutate(pub_time = as.POSIXct(strptime(pub_time, "%Y-%m-%d %T")))
+# messages <- read.csv2("chat/zeta_molly_data-1659195953292.csv", sep = ",")
+
+# messages <- messages %>% mutate(pub_time = as.POSIXct(strptime(pub_time, "%Y-%m-%d %T")))

 n_m <- nrow(messages)
  
@ -75,7 +76,8 @@ a_p <- 13
 b_p <- 60

 # Total length of messages
-len <- as.numeric(difftime(messages[n_m,]$pub_time, messages[1,]$pub_time, units = "secs"))
+# len <- as.numeric(difftime(messages[n_m,]$pub_time, messages[1,]$pub_time, units = "secs"))
+len <- 749503.076
 # Poisson lambda for messages per second

 library(extraDistr)
--- a/data/t.R
+++ b/data/t.R
@ -0,0 +1,112 @@
+library(foreach)
+library(iterators)
+library(ggplot2)
+library(dplyr)
+
+#------
+
+# messages <- read.csv2("chat/bus_time_data-1659191816218.csv", sep = ",")
+# messages <- read.csv2("chat/zeta_molly_data-1659195953292.csv", sep = ",")
+
+# messages <- messages %>% mutate(pub_time = as.POSIXct(strptime(pub_time, "%Y-%m-%d %T")))
+
+# Read the transcript CSV and parse both timestamp columns to POSIXct in a single pipeline.
+messages <- read.csv2("transcript/minecraft_data-1659808309634.csv", sep = ",") %>%
+  mutate(
+    start_time = as.POSIXct(strptime(start_time, "%Y-%m-%d %T")),
+    end_time = as.POSIXct(strptime(end_time, "%Y-%m-%d %T"))
+  )
+n_m <- nrow(messages)
+
+#--------
+# NOTE(review): pub_time is not among the columns parsed above — confirm the CSV provides it.
+ggplot(data = messages, mapping = aes(x = pub_time)) + geom_histogram(binwidth = 3600) + theme_minimal()
+
+#--------
+
+# Total length of messages in seconds (column 1 presumably holds the timestamps — confirm).
+# difftime(units = "secs") pins the unit; `-` then * 24 * 3600 was right only for a days result.
+len <- as.numeric(difftime(messages[n_m, 1], messages[1, 1], units = "secs"))
+# Poisson lambda for messages per second
+lambda <- nrow(messages) / len
+
+ggplot() + geom_histogram(mapping = aes(x = cumsum(rexp(n_m, lambda))), binwidth = 3600)
+# Observed histogram with simulated exponential-gap arrivals (rate lambda) overlaid in red
+ggplot(messages, aes(x = pub_time)) + theme_minimal() +
+  geom_histogram(binwidth = 600) +
+  geom_histogram(mapping = aes(x = cumsum(rexp(n_m, lambda)) + messages[1, 1]), binwidth = 600, fill = "red", alpha = 0.3)
+
+#-------
+
+# Diffs and diff probabilities. difftime(units = "secs") pins the unit; `-` picks one automatically.
+diffps <- messages %>% transmute(
+  diff = as.numeric(difftime(pub_time, lag(pub_time), units = "secs")),
+  p = pexp(diff, lambda)
+)
+ggplot(diffps, aes(x = diff)) + theme_minimal() + stat_ecdf()
+
+# Diff with 0.01 quantile
+qdiff <- qexp(0.01, lambda)
+fmess <- messages %>% filter(
+  as.numeric(difftime(pub_time, lag(pub_time), units = "secs")) < qdiff |
+    as.numeric(difftime(lead(pub_time), pub_time, units = "secs")) < qdiff
+)
+
+#-------
+
+# Print messages; a separator marks each gap to the next selected message that exceeds qdiff
+for (i in seq_len(nrow(fmess))) {
+  line <- fmess[i, ]
+  cat(sprintf("[%s] <%s> %s\n", line$pub_time, line$nick, line$message))
+  # The last row has no successor: fmess[i + 1, ] would be NA and `if (NA)` is an error.
+  if (i < nrow(fmess)) {
+    gap <- difftime(fmess[i + 1, "pub_time"], line$pub_time, units = "secs")
+    if (gap > qdiff) cat("----------------------\n")
+  }
+}
+
+# Show where messages were selected
+ggplot(messages, aes(x = pub_time)) + theme_minimal() +
+  geom_histogram(binwidth = 600) +
+  geom_histogram(data = fmess, binwidth = 600, fill = "green", alpha = 0.7)
+
+#-------------
+
+# Bayesian
+# p(d < 3/60) = 0.01
+# d ~ exp(l)
+# l ~ gamma(a_p, b_p)
+# l = argmax_l dgamma(a_p, 60)
+
+a_p <- 13
+b_p <- 60
+
+# Total length of messages
+# len <- as.numeric(difftime(messages[n_m,]$pub_time, messages[1,]$pub_time, units = "secs"))
+# NOTE(review): hard-coded value that replaces any len assigned earlier — confirm it matches the loaded data.
+len <- 749503.076
+
+library(extraDistr)
+
+# n_m gaps drawn from the Lomax distribution with parameters 1/(b_p + len) and a_p + n_m
+gen_diffs <- rlomax(n_m, 1 / (b_p + len), a_p + n_m)
+
+ggplot() + geom_histogram(mapping = aes(x = cumsum(gen_diffs)), binwidth = 3600)
+
+# Plot with overlaid posterior
+# NOTE(review): the red layer draws rexp(n_m, lambda) rather than gen_diffs — confirm that is intended.
+ggplot(messages, aes(x = pub_time)) + theme_minimal() +
+  geom_histogram(binwidth = 600) +
+  geom_histogram(
+    mapping = aes(x = cumsum(rexp(n_m, lambda)) + messages[1, 1]),
+    binwidth = 600, fill = "red", alpha = 0.3
+  )
+
+#---------------
+# Diff with 0.01 quantile
+qdiff <- qlomax(0.01, 1 / (b_p + len), a_p + n_m)
+fmess <- filter(
+  messages,
+  as.numeric(difftime(pub_time, lag(pub_time), units = "secs")) < qdiff |
+    as.numeric(difftime(lead(pub_time), pub_time, units = "secs")) < qdiff
+)
--- a/data/transcript/minecraft_data-1659808309634.csv
+++ b/data/transcript/minecraft_data-1659808309634.csv
--- a/data/transcript_freq_study.R
+++ b/data/transcript_freq_study.R
@ -0,0 +1,80 @@
+library(foreach)
+library(iterators)
+library(ggplot2)
+library(dplyr)
+library(extraDistr) # rlomax() / qlomax(); loaded up front so every section can use them
+
+#------
+messages <- read.csv2("transcript/minecraft_data-1659808309634.csv", sep = ",")
+messages <- messages %>% mutate(
+  start_time = as.POSIXct(strptime(start_time, "%Y-%m-%d %T")),
+  end_time = as.POSIXct(strptime(end_time, "%Y-%m-%d %T"))
+)
+
+n_m <- nrow(messages)
+
+#-------------
+
+# Bayesian
+# p(d < 3/60) = 0.01
+# d ~ exp(l)
+# l ~ gamma(a_p, b_p)
+# l = argmax_l dgamma(a_p, 60)
+#
+# Assigned before the plots and the print loop, which all read a_p, b_p, len, qdiff or fmess,
+# so nothing below is used before it has a value.
+
+a_p <- 13
+b_p <- 60
+
+# Total length of messages
+# len <- as.numeric(difftime(messages[n_m,]$pub_time, messages[1,]$pub_time, units = "secs"))
+len <- 697666.923
+
+# n_m gaps drawn from the Lomax distribution with parameters 1/(b_p + len) and a_p + n_m
+gen_diffs <- rlomax(n_m, 1 / (b_p + len), a_p + n_m)
+
+# Diff with 0.01 quantile
+qdiff <- qlomax(0.01, 1 / (b_p + len), a_p + n_m)
+
+# Keep messages whose gap to the previous or next message is below qdiff. The timestamp
+# column parsed above is start_time; this script never creates a pub_time column.
+fmess <- messages %>% filter(
+  as.numeric(difftime(start_time, lag(start_time), units = "secs")) < qdiff |
+    as.numeric(difftime(lead(start_time), start_time, units = "secs")) < qdiff
+)
+
+#--------
+
+ggplot(messages, aes(x = start_time)) + theme_minimal() + geom_histogram(binwidth = 3600)
+# Empirical CDF of the gaps between consecutive start times, with Lomax draws in red
+ggplot(messages %>% transmute(tdiff = difftime(start_time, lag(start_time), units = "secs")), aes(x = tdiff)) + theme_minimal() +
+  stat_ecdf() +
+  stat_ecdf(data = data.frame(tdiff = rlomax(n_m * 10, 1 / (b_p + len), a_p + n_m)), color = "red")
+
+ggplot() + geom_histogram(mapping = aes(x = cumsum(gen_diffs)), binwidth = 3600)
+
+# Plot with overlaid posterior. `lambda` is never assigned in this script, so the red
+# layer uses the gen_diffs drawn above instead of rexp(n_m, lambda).
+ggplot(messages, aes(x = start_time)) + theme_minimal() +
+  geom_histogram(binwidth = 600) +
+  geom_histogram(mapping = aes(x = cumsum(gen_diffs) + messages[1, "start_time"]), binwidth = 600, fill = "red", alpha = 0.3)
+
+#-------
+
+# Print messages; a separator marks each gap to the next selected message that exceeds qdiff
+# NOTE(review): nick and message are assumed to be columns of the transcript CSV — confirm.
+for (i in seq_len(nrow(fmess))) {
+  line <- fmess[i, ]
+  cat(sprintf("[%s] <%s> %s\n", line$start_time, line$nick, line$message))
+  # The last row has no successor: fmess[i + 1, ] would be NA and `if (NA)` is an error.
+  if (i < nrow(fmess)) {
+    gap <- difftime(fmess[i + 1, "start_time"], line$start_time, units = "secs")
+    if (gap > qdiff) cat("----------------------\n")
+  }
+}
+
+# Show where messages were selected
+ggplot(messages, aes(x = start_time)) + theme_minimal() +
+  geom_histogram(binwidth = 600) +
+  geom_histogram(data = fmess, binwidth = 600, fill = "green", alpha = 0.7)