Fundamental Limits of Estimating Parameters in a Stochastic Blockmodel

Simulations

Here we are checking to see how the estimate \[ \begin{align} \hat{p} = \frac{\sum_{ijk} B_{ijk}}{\rho_n \sum_{ij} A_{ij}} \end{align} \] fares in simulations.

library(igraph)
library(Matrix)
Nsim <- 10^3
res <- vector(,Nsim)
pp <- 0.4
p <- 0.4
n <- 10^5
rho <- sqrt(log(n))/n
N.all <- rbinom(Nsim, n, pp)
# pb <- txtProgressBar(min = 0, max = Nsim, initial = 0) 
for (i in 1:Nsim) {
  N <- N.all[i]
  c.sizes <- c(N, n-N)
  p.mat <- matrix(c(p*rho,0,0,0), nrow=2)

  g <- sample_sbm(n, p.mat, c.sizes, directed = FALSE, loops = FALSE)
  A <- as_adjacency_matrix(g, type="both", sparse=TRUE)

  sum.A <- sum(A)
  A2 <- A%*%A
  sum.B <- sum(A2) - sum(diag(A2))

  p.hat <- sum.B^2/(sum.A^3*rho)
  res[i] <- p.hat
  # setTxtProgressBar(pb,i)
}
hist(res)

var(res)

[1] 0.0001251491

# sanity checking
# n <- 10
# p.mat <- matrix(c(1,1,1,1), nrow=2)
# c.sizes <- c(10,0)
# g <- sample_sbm(n, p.mat, c.sizes, directed = FALSE, loops = FALSE)
# A <- as_adjacency_matrix(g, type="both", sparse=TRUE)

Let’s turn it into a routine and see what happens when we change \(n\) and \(\rho_n\).

Nsim <- 10^2
pp <- 0.45
p <- 0.7
df <- data.frame(n=numeric(), r=numeric(), v=numeric())
for (nt in c("10^3", "10^4", "10^5")) {
  for (r in c("1", "sqrt(log(n))", "log(sqrt(n))", "log(n)")) {
    n <- eval(parse(text=nt))
    rho <- eval(parse(text=r))/n
    N.all <- rbinom(Nsim, n, pp)
    res <- vector(,Nsim)
    for (i in 1:Nsim) {
      N <- N.all[i]
      c.sizes <- c(N, n-N)
      p.mat <- matrix(c(p*rho,0,0,0), nrow=2)

      g <- sample_sbm(n, p.mat, c.sizes, directed = FALSE, loops = FALSE)
      A <- as_adjacency_matrix(g, type="both", sparse=TRUE)

      sum.A <- sum(A)
      A2 <- A%*%A
      sum.B <- sum(A2) - sum(diag(A2))

      p.hat <- sum.B^2/(sum.A^3*rho)
      res[i] <- p.hat
    }
    df <- rbind(df, data.frame(n=nt, r=r, v=var(res)))
  }
}
xtabs(1/v~n+r, data=df)

n/r	1	sqrt(log(n))	log(sqrt(n))	log(n)
10^3	10.29915	78.17433	113.9508	346.7706
10^4	140.85548	810.61940	2052.9501	5366.9762
10^5	1050.77507	9096.68515	24475.4677	98919.6015

library(ggplot2)
library(tidyr)
Nsim <- 10^2
pp <- 0.45
p <- 0.7
df <- data.frame(n=numeric(), r=numeric(), v=numeric())
for (nt in c("10^3", "10^4", "2*10^4", "5*10^4", "10^5")) {
  for (r in c("1", "sqrt(log(n))", "log(n)")) {
    n <- eval(parse(text=nt))
    rho <- eval(parse(text=r))/n
    N.all <- rbinom(Nsim, n, pp)
    res <- vector(,Nsim)
    for (i in 1:Nsim) {
      N <- N.all[i]
      c.sizes <- c(N, n-N)
      p.mat <- matrix(c(p*rho,0,0,0), nrow=2)

      g <- sample_sbm(n, p.mat, c.sizes, directed = FALSE, loops = FALSE)
      A <- as_adjacency_matrix(g, type="both", sparse=TRUE)

      sum.A <- sum(A)
      A2 <- A%*%A
      sum.B <- sum(A2) - sum(diag(A2))

      p.hat <- sum.B^2/(sum.A^3*rho)
      res[i] <- p.hat
    }
    df <- rbind(df, data.frame(n=nt, r=r, v=var(res)))
  }
}

new.df <- df
new.df$n <- apply(new.df, 1, function(x) eval(parse(text=x[1])))
new.df$n2.rho <- apply(new.df, 1, function(x) {
  n <- as.integer(x[1])
  rho <- eval(parse(text=x[2]))
  1/(n^2*rho)
})
new.df$n.rho <- apply(new.df, 1, function(x) {
  n <- as.integer(x[1])
  rho <- eval(parse(text=x[2]))
  1/(n*rho)
})
new.df$n.n <- apply(new.df, 1, function(x) {
  n <- as.integer(x[1])
  rho <- eval(parse(text=x[2]))
  1/(n)
})
new.df$n.rho2 <- apply(new.df, 1, function(x) {
  n <- as.integer(x[1])
  rho <- eval(parse(text=x[2]))
  1/(n*rho^2)
})
df2 <- gather(new.df, type, value, v:n.rho2)
ggplot(df2, aes(x=n, y=1/value, group=type, color=type)) + geom_point() + geom_line() + facet_grid(.~r) + scale_y_log10() + scale_x_log10() + theme_minimal()

Nsim <- 10^3
pp <- 0.2
p <- 0.1
df <- data.frame(n=numeric(), r=numeric(), v=numeric(), b=numeric(), v2=numeric(), b2=numeric())
for (nt in c("10^4", "10^5", "5*10^5", "2*10^6")) {
  for (r in c("sqrt(log(n))", "log(n)", "n^0.2")) {
    n <- eval(parse(text=nt))
    rho <- eval(parse(text=r))/n
    N.all <- rbinom(Nsim, n, pp)
    res <- vector(,Nsim)
    res2 <- vector(,Nsim)
    for (i in 1:Nsim) {
      N <- N.all[i]
      g <- sample_gnp(N, p*rho)
      A <- as_adjacency_matrix(g, type="both", sparse=TRUE)

      sum.A <- sum(A)
      A2 <- A%*%A
      sum.B <- sum(A2) - sum(diag(A2))

      p.hat <- sum.B^2/(sum.A^3*rho)
      res[i] <- p.hat
      res2[i] <- sum.A/N^2/rho
    }
    df <- rbind(df, data.frame(n=nt, r=r, v=var(res), b=mean(res-p)^2, v2=var(res2), b2=mean(res2-p)^2))
  }
}

new.df <- df
new.df$n <- apply(new.df, 1, function(x) eval(parse(text=x[1])))
new.df$n.rho <- apply(new.df, 1, function(x) {
  n <- as.integer(x[1])
  rho <- eval(parse(text=x[2]))
  1/(n*rho)
})
new.df$n.rho2 <- apply(new.df, 1, function(x) {
  n <- as.integer(x[1])
  rho <- eval(parse(text=x[2]))
  1/(n*rho^2)
})
df2 <- gather(new.df, type, value, v:n.rho2)
ggplot(df2, aes(x=n, y=1/value, group=type, color=type)) + geom_point() + geom_line() + facet_grid(.~r) + scale_y_log10() + scale_x_log10() + theme_minimal()