Load data

# message = F, warning = F
library(tidyr)
library(dplyr)
library(ggplot2)
library(data.table)
library(knitr)
library(bit64)
library(extrafont)
library(scales)
library(grid)
library(RColorBrewer)
# Nine-step grey ramp from RColorBrewer; indexed below for text colours.
# The variable shares its name with grDevices::palette(); calls to palette()
# still resolve to the function.
palette <- brewer.pal(n = 9, name = "Greys")

Protagonist countries

# Component locations raw data (protagonist countries).
# Empty fields are read as NA (na.strings = ""), so a literal "NA" country code
# stays a regular string. TRUE is spelled out because T can be reassigned.
protagonists <- read.table(
  "~/galean/scripts/queries/data/componentlocation.tsv",
  sep = "\t", header = TRUE, na.strings = "", stringsAsFactors = TRUE
)
prot.dt <- data.table(protagonists)
# write.table(prot.dt[, component_id, country_code], 'scripts/queries/data/country_component.txt', row.names=F)

# Keep the three columns used downstream, highest mention frequency first.
prot <- prot.dt %>%
  select(component_id, country_code, frequency) %>%
  arrange(desc(frequency))

Interested countries

# Participation data: one column per country plus component_id (wide format).
participants <- read.table(
  "~/galean/scripts/queries/data/participation_data.txt",
  sep = "\t", header = TRUE, stringsAsFactors = FALSE, na.strings = ""
)

# Reshape to long format (component_id, country, tweets), upper-case the
# country codes and sort each event's countries by descending tweet count.
part.tidy <- participants %>%
  gather(country, tweets, -component_id) %>%
  mutate(country = toupper(country)) %>%
  arrange(component_id, desc(tweets))

# NOTE(review): "IN." presumably comes from read.table() renaming the reserved
# word `in` to `in.` in the header -- confirm against the input file.
part.tidy$country[part.tidy$country == "IN."] <- "IN"
part.tidy$country <- factor(part.tidy$country)
part.tidy <- data.table(part.tidy)

Countries List

# Two-column, header-less lookup table: country code -> country name.
countries_list <- data.table(read.table(
  "~/galean/scripts/queries/data/countries.txt",
  sep = "\t", header = FALSE, stringsAsFactors = TRUE
))
setnames(countries_list, c("V1", "V2"), c("country.code", "country.name"))

Summaries & filtering

Overall summary:

# Corpus-wide totals, hard-coded here; `users` is reused further down.
events <- 25481
tweets <- 193447671
users <- 26127625

# One-row table of the totals, rendered with thousands separators.
summ <- data.frame(events = events, tweets = tweets, users = users)
kable(summ, format.args = list(big.mark = ","), row.names = FALSE)

events	tweets	users
25,481	193,447,671	26,127,625

Protagonist countries:

# Columns of `prot`:
#   component_id: identifier of the event
#   country_code: protagonist country of the event
#   frequency:    no. of times the country was mentioned in the tweets of the event
# Rows were sorted by descending frequency when `prot` was built.
head(prot)

##    component_id country_code frequency
## 1:        17658           US    584530
## 2:        24041           NP    547669
## 3:        17579           FR    349313
## 4:        17406           FR    345206
## 5:        24089           NP    341518
## 6:        24391           GB    245290

# Per-column summary of the protagonist table.
summary(prot)

##   component_id    country_code     frequency     
##  Min.   :    1   US     :10162   Min.   :    30  
##  1st Qu.: 7665   GB     : 4015   1st Qu.:   100  
##  Median :15180   IN     : 1561   Median :   285  
##  Mean   :14132   AU     :  974   Mean   :  1918  
##  3rd Qu.:20635   UA     :  921   3rd Qu.:   982  
##  Max.   :25481   RU     :  823   Max.   :584530  
##                  (Other):14161

# ggplot(prot, aes(x=1, y=frequency)) + geom_boxplot()

Interested countries:

# First rows of the long-format participation table
# (one row per event/country combination).
head(part.tidy)

##    component_id country tweets
## 1:            1      US    319
## 2:            1      CO     30
## 3:            1      GB     18
## 4:            1      ID      7
## 5:            1      CA      6
## 6:            1      JP      5

# Per-column summary of the participation table.
summary(part.tidy)

##   component_id      country            tweets         
##  Min.   :    1   AD     :  20066   Min.   :     0.00  
##  1st Qu.: 6638   AE     :  20066   1st Qu.:     0.00  
##  Median :13566   AF     :  20066   Median :     0.00  
##  Mean   :13181   AG     :  20066   Mean   :     6.63  
##  3rd Qu.:19738   AI     :  20066   3rd Qu.:     0.00  
##  Max.   :25481   AL     :  20066   Max.   :108599.00  
##                  (Other):4896104

# ggplot(part.tidy, aes(x=1, y=tweets)) + geom_boxplot()

Remove countries with few protagonized events (< median) and remove events with few tweets (< median):

# prot.events_per_country <- prot[, .N, by=country_code]
# summary(prot.events_per_country$N)
# to_remove <- prot.events_per_country[N < 18.5]$country_code
# prot <- prot[!(country_code %in% to_remove)]

# summary(prot$frequency)
# to_remove <- prot[frequency < 288]$component_id
# prot <- prot[!(component_id %in% to_remove)]

Measure bias

Events being protagonized

# Number of events protagonized by each country.
prot.per_country <- prot[, .N, by = country_code]
n <- nrow(prot.per_country)

# Keep the 25 countries with the most events, sorted ascending by N (the factor
# levels in the plot follow this row order). tail() returns exactly 25 rows --
# the former (n - 25):n index returned 26 -- and stays valid when n < 25.
prot.per_country.top25 <- tail(prot.per_country[order(N)], 25L)

p <- ggplot(prot.per_country.top25,
            aes(x = factor(country_code, levels = country_code), y = N, label = comma(N))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 12000, by = 1000), limits = c(0, 12000)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") + ylab("Events")

# The title is only added to the on-screen version; saved figures have none.
print(p + ggtitle("Top 25 countries protagonizing events"))

ggsave(paste(c(PDF_PATH, "protagonist-bias.eps"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste(c(PDF_PATH, "protagonist-bias.pdf"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)

Events a country is interested in

# Upper quantiles of the tweet counts per event/country combination.
quantile(part.tidy$tweets, c(.8, .9, .95, .99, .999, .9999))

##      80%      90%      95%      99%    99.9%   99.99% 
##    0.000    2.000    8.000   77.000 1026.000 6053.901

# use .99 percentile (77 tweets, as reported by the quantile output)
part.tidy.p99 <- part.tidy[tweets >= 77, ]

# Number of events per country that reach the threshold.
part.per_country <- part.tidy.p99[, .N, by = country]
n <- nrow(part.per_country)

# Keep the 25 countries with the most such events, sorted ascending by N.
# tail() returns exactly 25 rows -- the former (n - 25):n index returned 26 --
# and stays valid when n < 25.
part.per_country.top25 <- tail(part.per_country[order(N)], 25L)


p <- ggplot(part.per_country.top25,
            aes(x = factor(country, levels = country), y = N, label = comma(N))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 12000, by = 1000), limits = c(0, 12000)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") + ylab("Events")

# The title is only added to the on-screen version; saved figures have none.
print(p + ggtitle("Top 25 countries interested in events"))

ggsave(paste(c(PDF_PATH, "interest-bias.eps"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste(c(PDF_PATH, "interest-bias.pdf"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)

Tweets per interested country

# Total tweets per country over the thresholded event/country rows
# (the unnamed aggregate becomes column V1).
part.sum_per_country <- part.tidy.p99[, sum(tweets), by = country]
n <- nrow(part.sum_per_country)

# Keep the 25 countries with the most tweets, sorted ascending by V1.
# tail() returns exactly 25 rows -- the former (n - 25):n index returned 26 --
# and stays valid when n < 25.
part.sum_per_country.top25 <- tail(part.sum_per_country[order(V1)], 25L)

p <- ggplot(part.sum_per_country.top25,
            aes(x = factor(country, levels = country), y = V1, label = comma(V1))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 14e6, by = 2e6), limits = c(0, 14e6)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") + ylab("Tweets")

# The title is only added to the on-screen version; saved figures have none.
print(p + ggtitle("Top 25 countries with tweets"))

ggsave(paste(c(PDF_PATH, "tweets-per-country.eps"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste(c(PDF_PATH, "tweets-per-country.pdf"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)

Users per location

For most of the users, it was not possible to identify a location. The rest is distributed mostly among US, GB, CA and ID (Indonesia).

# Users per location. The file has no header (columns V1, V2); empty location
# fields are read as NA.
user_countries <- read.table(
  "~/galean/scripts/queries/data/locations_distribution.txt",
  sep = "\t", stringsAsFactors = FALSE, na.strings = ""
)
user_countries <- user_countries %>%
  mutate(country = toupper(V1),
         frequency = V2) %>%
  select(country, frequency)
user_countries <- data.table(user_countries)
user_countries <- user_countries[order(frequency)]

n <- nrow(user_countries)
# Last 26 rows, i.e. the same rows the former (n - 25):n index selected, but
# tail() does not produce zero/negative indices when the table has <= 25 rows.
# The follow-up plot filters out the missing-location row from this selection.
user_countries_pl <- tail(user_countries, 26L)

p <- ggplot(user_countries_pl,
            aes(x = factor(country, levels = country), y = frequency, label = comma(frequency))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") + ylab("Users") +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 17.5e6, by = 2e6), limits = c(0, 17.5e6)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black")

# The title is only added to the on-screen version; saved figures have none.
print(p + ggtitle("Users per country tweeting about events"))

ggsave(paste(c(PDF_PATH, "users-per-country.eps"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste(c(PDF_PATH, "users-per-country.pdf"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)

# Same chart restricted to rows with a known location. data.table treats an NA
# result in `i` as FALSE, so rows with a missing country are dropped; the NA
# test is spelled out here, the "<NA>" comparison is kept unchanged.
p <- ggplot(
  user_countries_pl[!is.na(country) & country != "<NA>"],
  aes(
    x = factor(country, levels = country),
    y = frequency,
    label = comma(frequency)
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Users") +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 4e6, by = .5e6), limits = c(0, 4e6)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black")

# Titled version on screen, untitled version on disk.
print(p + ggtitle("Users per country tweeting about events"))

ggsave(paste0(c(PDF_PATH, "users-per-country-f.eps"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(c(PDF_PATH, "users-per-country-f.pdf"), collapse = ""), p, dpi = pl.DPI, width = pl.W, height = pl.H)

Users related to news accounts

Out of \(26,127,625\) users, \(71.27\%\) have posted at least one retweet, and \(13.46\%\) have retweeted a news account at least once.

# Per-user retweet table, read with data.table::fread(); the three columns are
# named on load.
# NOTE(review): presumably total_rts is the user's number of retweets and
# news_rt_ed how many of those retweet a news account -- confirm against the
# query that produced the file.
user_news <- fread('~/galean/scripts/queries/data/user_rts_news.txt', 
                   col.names = c("user_id", "total_rts", "news_rt_ed"))

## 
Read 56.5% of 18621165 rows
Read 18621165 rows and 3 (of 3) columns from 0.248 GB file in 00:00:03

Some numbers:

# First rows of the per-user retweet table.
head(user_news)

##       user_id total_rts news_rt_ed
## 1:  469762048        11          0
## 2:  134217729         1          0
## 3: 2896865963         1          0
## 4:   40044386        29          0
## 5: 2531840763         2          0
## 6: 2281701384         1          0

# Number of rows in user_news (used below as the count of users who retweeted).
nrow(user_news)

## [1] 18621165

# Per-column summary of the per-user retweet table.
summary(user_news)

##     user_id             total_rts           news_rt_ed       
##  Min.   :        12   Min.   :    1.000   Min.   :    0.000  
##  1st Qu.: 226215392   1st Qu.:    1.000   1st Qu.:    0.000  
##  Median : 541496797   Median :    2.000   Median :    0.000  
##  Mean   :1007747589   Mean   :    5.416   Mean   :    0.769  
##  3rd Qu.:1699603753   3rd Qu.:    3.000   3rd Qu.:    0.000  
##  Max.   :3344323469   Max.   :25954.000   Max.   :17709.000

# Row with the largest total_rts (which.max() returns the first maximum).
user_news[which.max(total_rts)]

##       user_id total_rts news_rt_ed
## 1: 1669934971     25954      16586

# Row with the largest news_rt_ed (which.max() returns the first maximum).
user_news[which.max(news_rt_ed)]

##      user_id total_rts news_rt_ed
## 1: 275942684     20803      17709

The user 1669934971 corresponds to NewsFeedNetwork. The user 275942684 corresponds to rashidaldosari.

# Share of all users that retweeted at all, and that retweeted a news account.
# Both counts are computed once; `.N` counts the matching rows without
# materialising the filtered copy of the table twice.
users_rt <- nrow(user_news)
news_retweeters <- user_news[news_rt_ed > 0, .N]

d <- data.frame(total_users = users,
                users_rt = users_rt,
                percentage = users_rt / users * 100,
                retweeted_news_accounts = news_retweeters,
                percentage2 = news_retweeters / users * 100)
kable(d)

total_users	users_rt	percentage	retweeted_news_accounts	percentage2
26127625	18621165	71.27003	3517700	13.46353

Normalization

(Co-)Protagonism (relative)

A country is represented as a vector with respect to all other countries \(C\):

\[c_i = \Big( \frac{\text{total events protagonized by } c_i \text{ and } c_j}{\text{total events protagonized by } c_i}, c_j \in C \Big) \]

# Distinct protagonist country codes, in order of first appearance in `prot`.
countries <- unique(prot, by=c("country_code"))[, country_code]
# Self-join on the event id: every protagonist of an event is paired with every
# protagonist of the same event (itself included). Columns get .x/.y suffixes.
coprot <- prot %>% left_join(prot, by="component_id")

# do not consider self protagonism in events with more than 1 protagonist
# ...not sure what this was for; it is not used
# coprot <- coprot[coprot[, country_code.x != country_code.y | .N == 1, by=component_id]$V1]

# events protagonized per country
prot.by_country <- prot[, .N, by=country_code]

# For each country c_i, build a table with one row per co-protagonist c_j:
#   N   = number of events protagonized by both c_i and c_j
#   per = N divided by the total number of events protagonized by c_i
# The tables are stored in a list keyed by the country code of c_i.
coprot.vector <- list()
for(country in countries) {
  total_events_ci <- prot.by_country[country_code == country, N]
  coprot_i <- coprot[country_code.x == country]
  coprot_freq <- coprot_i[, .N, by=country_code.y]
  # `:=` adds `per` by reference and returns the table, which is then stored.
  coprot.vector[[country]] <- coprot_freq[, per := coprot_freq$N / total_events_ci]
}

Co-protagonism of countries

Bar length represents the fraction of events protagonized by each country, with respect to total events protagonized by main country (in the title of each plot).

selected_countries <- c("US", "GB", "CA", "NG", "UA", "RU", "JP", "CN", "PS", "IL")


# One bar chart per selected country: its strongest co-protagonists, excluding
# the country itself, sorted ascending by share.
for (country in selected_countries) {
  tmp <- coprot.vector[[country]][country_code.y != country][order(per)]
  n <- nrow(tmp)
  # tail() keeps the same 16 rows as the former (n - 15):n index, but does not
  # fail with mixed negative/positive indices when a country has fewer than 16
  # co-protagonists.
  # NOTE(review): the rendered output reports "Removed 1 rows" for several
  # countries; presumably a share above the 0.5 axis limit is being dropped by
  # `limits` -- confirm and widen the limits if that bar should be shown.
  p <- ggplot(tail(tmp, 16L),
              aes(x = factor(country_code.y, levels = country_code.y),
                  y = per, label = as.character(round(per, 4)))) +
    geom_bar(stat = "identity") +
    coord_flip() +
    xlab("Country") + ylab("Co-protagonism") +
    geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
    scale_y_continuous(breaks = seq(0, 1, by = .125), limits = c(0, .5)) +
    scale_x_discrete(labels = map_country) +
    fte_theme() +
    geom_hline(yintercept = 0, size = 0.4, color = "black")

  # Titled version on screen, untitled version on disk.
  print(p + ggtitle(map_country(country)))
  ggsave(paste(c(PDF_PATH, "co-prot-", country, ".eps"), collapse = ""), p,
         dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1)
  ggsave(paste(c(PDF_PATH, "co-prot-", country, ".pdf"), collapse = ""), p,
         dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1)
}

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

# Rows of the event self-join per country (V1) and each country's share of all
# joined rows (per), sorted ascending by share. The group size is taken from
# `.N`; the former mean(length(component_id)) computed the same number.
# as.numeric() keeps V1 a double, as before.
tmp <- data.table(prot %>% left_join(prot, by = "component_id"))[
  , list(V1 = as.numeric(.N)), by = country_code.x
]
tmp <- tmp[, per := V1 / sum(V1)][order(per)]

n <- nrow(tmp)
# tail() keeps the same 16 rows as the former (n - 15):n index and stays valid
# for tables with fewer than 16 rows.
p <- ggplot(tail(tmp, 16L),
            aes(x = factor(country_code.x, levels = country_code.x),
                y = per, label = as.character(round(per, 4)))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") + ylab("Co-protagonism") +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(breaks = seq(0, 1, by = .125), limits = c(0, .5)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black")

# Titled version on screen, untitled version on disk.
print(p + ggtitle("All"))

ggsave(paste(c(PDF_PATH, "co-prot-all", ".eps"), collapse = ""), p,
       dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1)
ggsave(paste(c(PDF_PATH, "co-prot-all", ".pdf"), collapse = ""), p,
       dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1)

Comparison between countries

Remake plots from Sec 6.1:

prot.comp <- list()
to_compare <- c("EC", "HN", "IL", "PS", "KP", "KR", "BD", "US", "GB", "UA", "RU", "MY", "VN", "CN", "JP", "CA", "NG")

# For every country to compare, build a dense co-protagonism vector aligned
# with `countries`: the stored share where the pair co-occurs, 0 otherwise.
# vapply() fills a vector of known length instead of growing one with c(), and
# fails loudly if a lookup ever returns more than one value.
for (cnt in to_compare) {
  cnt_vector <- coprot.vector[[cnt]]
  covector <- vapply(as.character(countries), function(code_j) {
    share <- cnt_vector[country_code.y == code_j, per]
    if (length(share) == 0) 0 else share
  }, numeric(1), USE.NAMES = FALSE)
  names(covector) <- countries
  prot.comp[[cnt]] <- covector
}


# Wide table: one column per compared country (in the order listed here) plus
# the `country` column that identifies each row.
sel_codes <- c("EC", "HN", "IL", "PS", "KP", "KR", "BD", "US", "GB",
               "UA", "RU", "MY", "VN", "CN", "JP", "CA", "NG")
sel_columns <- lapply(sel_codes, function(code) prot.comp[[code]])
names(sel_columns) <- sel_codes
prot.comp_sel <- do.call(data.table, c(sel_columns, list(country = countries)))

# Long format: x = row country, y = compared country, value = co-protagonism.
prot.comp_sel2 <- data.table(
  prot.comp_sel %>%
    gather(y, value, -country) %>%
    transmute(x = country, y, value)
)

# Country pairs to compare; each pair yields one scatter plot.
pairs <- list(c("EC", "HN"), c("IL", "PS"), c("KP", "KR"), c("BD", "US"), c("GB", "US"),
              c("RU", "UA"), c("EC", "US"), c("IL", "KP"), c("CN", "MY"), c("JP", "CN"),
              c("CA", "NG"))

# For each pair: print the pair's names, plot the two co-protagonism vectors
# against all countries (x ordered by value, x labels hidden), show the titled
# plot and save the untitled one as EPS and PDF.
# NOTE(review): the rendered output reports a different number of removed
# points on each render of the same plot, which suggests the random y-jitter
# pushes points outside `limits` -- confirm, and consider jittering only
# horizontally if those points should be kept.
for(pair in pairs) {
  print(paste(map_country(pair), collapse = " & "))
  # Rows of the long table belonging to either country of the pair.
  tmp <- prot.comp_sel2[y %in% pair]
  p <- ggplot(tmp, aes(x=reorder(x, value), 
                       y=value, 
                       group=factor(y, levels=pair),
                       color=factor(y, levels=pair),
                       shape=factor(y, levels=pair))) +
    geom_point(size=1, position="jitter", alpha=1) + 
    # geom_smooth(aes(x=reorder(x, value), y=value, group=y), size=.5) +
    xlab("Countries") + 
    ylab("Co-protagonism") +
    fte_theme() +
    # scale_x_discrete(labels=c())
    theme(legend.justification=c(0, 1), legend.position=c(0,1),
          legend.background = element_rect(fill="transparent"),
          legend.key.height=unit(1.8,"line"),
          axis.text=element_text(size=6),
          axis.text.x = element_blank()) + 
    scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    scale_color_discrete(name="Protagonist Country",
                         breaks=pair,
                         labels=map_country(pair)) +
    scale_shape_discrete(name="Protagonist Country",
                         solid = F,
                         breaks=pair,
                         labels=map_country(pair))
    # scale_x_discrete(labels=map_country)
  
  print(p + ggtitle(paste(map_country(pair), collapse = " & ")))
  
  # File names are built as co-prot-<code1>-<code2>.<ext>.
  ggsave(paste(c(PDF_PATH, paste(c("co-prot", pair), collapse = "-"), ".eps"), collapse = ""), 
         p, dpi=pl.DPI, width=pl.W, height=pl.H)
  ggsave(paste(c(PDF_PATH, paste(c("co-prot", pair), collapse = "-"), ".pdf"), collapse = ""), 
         p, dpi=pl.DPI, width=pl.W, height=pl.H)
}

## [1] "Ecuador & Honduras"

## Warning: Removed 182 rows containing missing values (geom_point).

## Warning: Removed 182 rows containing missing values (geom_point).

## Warning: Removed 182 rows containing missing values (geom_point).

## [1] "Israel & Palestine"

## Warning: Removed 117 rows containing missing values (geom_point).

## Warning: Removed 107 rows containing missing values (geom_point).

## Warning: Removed 129 rows containing missing values (geom_point).

## [1] "North Korea & South Korea"

## Warning: Removed 124 rows containing missing values (geom_point).

## Warning: Removed 146 rows containing missing values (geom_point).

## Warning: Removed 130 rows containing missing values (geom_point).

## [1] "Bangladesh & United States"

## Warning: Removed 79 rows containing missing values (geom_point).

## Warning: Removed 81 rows containing missing values (geom_point).

## Warning: Removed 76 rows containing missing values (geom_point).

## [1] "Great Britain & United States"

## Warning: Removed 24 rows containing missing values (geom_point).

## Warning: Removed 23 rows containing missing values (geom_point).

## Warning: Removed 31 rows containing missing values (geom_point).

## [1] "Russia & Ukraine"

## Warning: Removed 85 rows containing missing values (geom_point).

## Warning: Removed 94 rows containing missing values (geom_point).

## Warning: Removed 99 rows containing missing values (geom_point).

## [1] "Ecuador & United States"

## Warning: Removed 99 rows containing missing values (geom_point).

## Warning: Removed 93 rows containing missing values (geom_point).

## Warning: Removed 87 rows containing missing values (geom_point).

## [1] "Israel & North Korea"

## Warning: Removed 113 rows containing missing values (geom_point).

## Warning: Removed 116 rows containing missing values (geom_point).

## Warning: Removed 122 rows containing missing values (geom_point).

## [1] "China & Malaysia"

## Warning: Removed 109 rows containing missing values (geom_point).

## Warning: Removed 113 rows containing missing values (geom_point).

## Warning: Removed 119 rows containing missing values (geom_point).

## [1] "Japan & China"

## Warning: Removed 105 rows containing missing values (geom_point).

## Warning: Removed 125 rows containing missing values (geom_point).

## Warning: Removed 96 rows containing missing values (geom_point).

## [1] "Canada & Nigeria"

## Warning: Removed 101 rows containing missing values (geom_point).

## Warning: Removed 107 rows containing missing values (geom_point).

## Warning: Removed 91 rows containing missing values (geom_point).

Export vectors to files:

# Write each country's co-protagonism table to its own tab-separated file,
# named after the country code. The loop variable no longer reuses the name of
# base::c(), which the path construction itself relied on.
for (code in countries) {
  out_path <- paste0("~/galean/scripts/queries/data/protagonist_vectors/", code, ".txt")
  write.table(coprot.vector[[code]], out_path, row.names = FALSE, sep = "\t")
}

Similar countries

Euclidean distance is not suitable for similarity between countries (vectors), due to the high sparsity of the vectors. Cosine similarity is better.

# Pairwise country distances, one header-less file per metric with three
# columns: first country, second country, distance. Empty fields are read as NA.
knn.cos <- data.table(read.table(
  "~/galean/scripts/queries/data/knn_cos.txt",
  header = FALSE, sep = "\t", stringsAsFactors = TRUE, na.strings = ""
))
knn.euc <- data.table(read.table(
  "~/galean/scripts/queries/data/knn_euc.txt",
  header = FALSE, sep = "\t", stringsAsFactors = TRUE, na.strings = ""
))

setnames(knn.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))
setnames(knn.euc, c("V1", "V2", "V3"), c("c1", "c2", "dist.euc"))

# Horizontal bar chart of 1 - distance for the neighbours of one country.
#   neighbours: table with a `c2` column and a distance column, already sorted
#               in the order the bars should appear
#   dist_col:   name of the distance column
#   y_label:    label for the value axis
#   title:      plot title
# The value axis shows 1 - distance, so it is labelled as a similarity by the
# callers (it was previously labelled as the distance itself).
plot_most_similar <- function(neighbours, dist_col, y_label, title) {
  plot_data <- data.frame(
    c2 = factor(neighbours$c2, levels = neighbours$c2),
    similarity = 1 - neighbours[[dist_col]]
  )
  ggplot(plot_data, aes(x = c2, y = similarity)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    xlab("Country") +
    ylab(y_label) +
    ggtitle(title)
}

knn.euc.us <- knn.euc[c1 == "US" & c2 != "US"][order(dist.euc)]
plot_most_similar(knn.euc.us, "dist.euc", "Similarity (1 - Euclidean distance)",
                  "Most similar countries to US (euclidean distance) (higher is more similar)")

# US
knn.cos.us <- knn.cos[c1 == "US" & c2 != "US"][order(dist.cos)]
plot_most_similar(knn.cos.us, "dist.cos", "Similarity (1 - cosine distance)",
                  "Most similar countries to US (cosine distance) (higher is more similar)")

# GB
knn.cos.gb <- knn.cos[c1 == "GB" & c2 != "GB"][order(dist.cos)]
plot_most_similar(knn.cos.gb, "dist.cos", "Similarity (1 - cosine distance)",
                  "Most similar countries to GB (cosine distance) (higher is more similar)")

# UA
knn.cos.ua <- knn.cos[c1 == "UA" & c2 != "UA"][order(dist.cos)]
plot_most_similar(knn.cos.ua, "dist.cos", "Similarity (1 - cosine distance)",
                  "Most similar countries to Ukraine (cosine distance) (higher is more similar)")

# VE
knn.cos.ve <- knn.cos[c1 == "VE" & c2 != "VE"][order(dist.cos)]
plot_most_similar(knn.cos.ve, "dist.cos", "Similarity (1 - cosine distance)",
                  "Most similar countries to Venezuela (cosine distance) (higher is more similar)")

# CL
knn.cos.cl <- knn.cos[c1 == "CL" & c2 != "CL"][order(dist.cos)]
plot_most_similar(knn.cos.cl, "dist.cos", "Similarity (1 - cosine distance)",
                  "Most similar countries to Chile (cosine distance) (higher is more similar)")

Get all countries with their most similar counterparts:

# Nearest neighbour of every country under cosine distance.
# Pairs at distance exactly 0 are excluded first.
one_nn <- knn.cos[dist.cos != 0]
# `.I` yields row numbers; per c1 keep the row(s) whose distance equals the
# group minimum (ties are all kept), then subset the table by those rows.
one_nn <- one_nn[one_nn[, .I[dist.cos == min(dist.cos)], by=c1]$V1]
# Sort by distance and drop rows containing NA.
one_nn <- na.omit(one_nn[order(dist.cos)])

# Replace codes by names via map_country(); the unnamed columns become V1, V2.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)]
one_nn.table <- one_nn.table[order(dist.cos)]

# Show the 30 closest pairs.
kable(one_nn.table[1:30]) #, format="latex", booktabs=T)

V1	V2	dist.cos
Russia	Ukraine	0.0079
Ukraine	Russia	0.0079
Canada	Nigeria	0.0211
Nigeria	Canada	0.0211
China	Japan	0.0239
Japan	China	0.0239
France	Italy	0.0259
Italy	France	0.0259
Peru	Canada	0.0266
Iraq	Syria	0.0301
Syria	Iraq	0.0301
Afghanistan	Cuba	0.0314
Cuba	Afghanistan	0.0314
Spain	France	0.0320
Mexico	Netherlands	0.0338
Netherlands	Mexico	0.0338
Kenya	Nigeria	0.0355
Puerto Rico	Seychelles	0.0362
Seychelles	Puerto Rico	0.0362
Iran	Iraq	0.0364
Libya	Iraq	0.0365
Venezuela	Cuba	0.0381
Germany	France	0.0384
Australia	India	0.0386
India	Australia	0.0386
United Arab Emirates	South Korea	0.0396
South Korea	United Arab Emirates	0.0396
Sudan	Libya	0.0404
Antarctica	Canada	0.0407
Nepal	South Africa	0.0429

# Country pairs with a small, non-zero cosine distance, closest first.
knn.cos.pl <- knn.cos[order(dist.cos)]
knn.cos.pl <- knn.cos.pl[dist.cos < .05 & dist.cos > 0]

# Tile chart of those pairs, each tile labelled with its rounded distance.
# The tile fill is a constant, so it is set on the layer: inside aes() the
# string "white" was mapped as a data value, which gives the tiles a default
# scale colour and adds a one-entry legend instead of painting them white.
ggplot(knn.cos.pl, aes(x = reorder(c1, dist.cos), y = reorder(c2, dist.cos),
                       label = as.character(round(dist.cos, 2)))) +
  geom_tile(size = .1, color = "white", fill = "white") +
  fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  # scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")

(Co-)Protagonism (symmetric)

A country is represented as the vector of Jaccard similarities between it and the rest:

\[ c_i = \Big( \frac{\text{events protagonized by } c_i \text{ and } c_j}{\text{events protagonized by } c_i \text{ or } c_j}, c_j \in C \Big)\]

The co-protagonism between two countries is defined as follows:

\[ s(c1, c2) = \frac{|E(c1) \cap E(c2)|}{|E(c1) \cup E(c2)|} \]

Where \(E(c1)\) is the set of events protagonized by \(c1\), and \(|\cdot|\) denotes the number of events in a set.

# Quantiles of the number of events protagonized per country.
quantile(prot[, .N, by=country_code]$N, c(.5, .75, .9, .95, .99))

##     50%     75%     90%     95%     99% 
##   18.50   80.25  315.50  631.75 1683.70

# Events protagonized per country, and the distribution of those counts.
prot.events_per_country <- prot[, .N, by=country_code]
summary(prot.events_per_country$N)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     4.00    18.50   166.40    80.25 10160.00

# Drop countries that protagonize fewer events than the mean across countries.
to_remove <- prot.events_per_country[N < mean(prot.events_per_country$N)]$country_code
prot.f <- prot[!(country_code %in% to_remove)]

countries <- unique(prot.f, by = c("country_code"))[, country_code]

# Event ids per remaining country, looked up once per country instead of once
# per pair of countries.
country_codes <- as.character(countries)
events_by_country <- lapply(country_codes, function(code) {
  prot[country_code == code]$component_id
})

# Jaccard similarity |E(ci) n E(cj)| / |E(ci) u E(cj)| for every ordered pair
# (self-pairs included). The result vectors are allocated up front and filled
# in the same order as before: ci varies slowest, cj fastest.
n_countries <- length(country_codes)
cis <- rep(country_codes, each = n_countries)
cjs <- rep(country_codes, times = n_countries)
jacs <- numeric(n_countries * n_countries)
for (i in seq_len(n_countries)) {
  events_ci <- events_by_country[[i]]
  for (j in seq_len(n_countries)) {
    events_cj <- events_by_country[[j]]
    jacs[(i - 1L) * n_countries + j] <-
      length(intersect(events_ci, events_cj)) / length(union(events_ci, events_cj))
  }
}

# All ordered country pairs with their Jaccard similarity.
prot.sim <- data.table(country.x = cis, country.y = cjs, jacc.sim = jacs)
# Keep pairs with a similarity strictly between 0 and 1, most similar first.
prot.most_similar <- prot.sim[jacc.sim > 0 & jacc.sim < 1][order(-jacc.sim)]
# Attach the per-country event counts for both sides of each pair. The join key
# in prot.by_country is `country_code`, so each side is temporarily renamed to
# that name (setnames() renames in place), joined, and renamed back.
setnames(prot.most_similar, "country.x", "country_code")
tmp <- prot.most_similar %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code", "country.y"), c("country.x", "country_code"))
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("country.y"))
# After the second join the two count columns are referred to as N.x and N.y.
prot.most_similar <- tmp
prot.most_similar <- prot.most_similar[order(-jacc.sim)]

# Top 50 pairs with country names instead of codes; the unnamed name columns
# become V1 and V2.
prot.most_similar.table <- prot.most_similar[1:50][, list(map_country(country.x),
                                                    map_country(country.y),
                                                    N.x, N.y,
                                                    jacc.sim)]
kable(prot.most_similar.table[1:50])#, format="latex", booktabs=T)

V1	V2	N.x	N.y	jacc.sim
Palestine	Israel	360	561	0.2863128
Israel	Palestine	561	360	0.2863128
Ukraine	Russia	921	823	0.2094313
Russia	Ukraine	823	921	0.2094313
United States	Great Britain	10162	4015	0.0966120
Great Britain	United States	4015	10162	0.0966120
Syria	Iraq	647	654	0.0832639
Iraq	Syria	654	647	0.0832639
Pakistan	India	453	1561	0.0752803
India	Pakistan	1561	453	0.0752803
Iran	Israel	496	561	0.0698381
Israel	Iran	561	496	0.0698381
Japan	China	354	646	0.0604454
China	Japan	646	354	0.0604454
France	Germany	627	371	0.0583245
Germany	France	371	627	0.0583245
Great Britain	Australia	4015	974	0.0576638
Australia	Great Britain	974	4015	0.0576638
Germany	Brazil	371	236	0.0574913
Brazil	Germany	236	371	0.0574913
Turkey	Syria	198	647	0.0536160
Syria	Turkey	647	198	0.0536160
Iran	Iraq	496	654	0.0511883
Iraq	Iran	654	496	0.0511883
Malaysia	Australia	262	974	0.0492360
Australia	Malaysia	974	262	0.0492360
India	Australia	1561	974	0.0475207
Australia	India	974	1561	0.0475207
Great Britain	Canada	4015	715	0.0443807
Canada	Great Britain	715	4015	0.0443807
Libya	Egypt	253	316	0.0440367
Egypt	Libya	316	253	0.0440367
India	Great Britain	1561	4015	0.0436085
Great Britain	India	4015	1561	0.0436085
Palestine	Egypt	360	316	0.0432099
Egypt	Palestine	316	360	0.0432099
Syria	Iran	647	496	0.0428832
Iran	Syria	496	647	0.0428832
Spain	Germany	258	371	0.0413907
Germany	Spain	371	258	0.0413907
United States	Canada	10162	715	0.0370900
Canada	United States	715	10162	0.0370900
Great Britain	France	4015	627	0.0363921
France	Great Britain	627	4015	0.0363921
Yemen	Iran	202	496	0.0356083
Iran	Yemen	496	202	0.0356083
India	China	1561	646	0.0346929
China	India	646	1561	0.0346929
United States	India	10162	1561	0.0345041
India	United States	1561	10162	0.0345041

# Tile chart of the Jaccard similarity between country pairs. Both axes are
# ordered with reorder() on -jacc.sim, tiles are filled on a log scale and
# labelled with the similarity rounded to two decimals.
ggplot(
  prot.most_similar,
  aes(
    x = reorder(country.x, -jacc.sim),
    y = reorder(country.y, -jacc.sim),
    fill = jacc.sim,
    label = as.character(round(jacc.sim, 2))
  )
) +
  geom_tile(size = .1) +
  fte_theme() +
  geom_text(size = 2, color = palette[1]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 7)
  ) +
  labs(x = "Country", y = "Country") +
  scale_fill_gradient(trans = "log")

selected_countries <- c("IL", "PS", "UA", "RU", "GB", "US", "IQ", "SY")

# For each selected country, print its name followed by a table of its ten most
# similar countries (pairs with similarity exactly 1 are excluded).
for (country in selected_countries) {
  # head() returns at most ten rows; `[1:10]` padded the table with all-NA rows
  # whenever fewer than ten candidates were available.
  tmp <- head(prot.sim[country.x == country][order(-jacc.sim)][jacc.sim != 1], 10L)
  tmp <- tmp[, list(Country = map_country(country.y), Similarity = jacc.sim)]
  # Named so that it does not reuse the name of base::t().
  similarity_table <- kable(tmp) # , format="latex", booktabs=T)
  print(map_country(country))
  print(similarity_table)
}

##       IL 
## "Israel" 
## 
## 
## Country          Similarity
## --------------  -----------
## Palestine         0.2863128
## Iran              0.0698381
## Egypt             0.0317647
## Syria             0.0307167
## France            0.0259067
## United States     0.0208492
## Iraq              0.0201511
## Pakistan          0.0201207
## Great Britain     0.0184732
## Ukraine           0.0178571
##          PS 
## "Palestine" 
## 
## 
## Country          Similarity
## --------------  -----------
## Israel            0.2863128
## Egypt             0.0432099
## Nigeria           0.0157895
## Pakistan          0.0149813
## Iraq              0.0129870
## Great Britain     0.0115607
## Ukraine           0.0110497
## Syria             0.0110442
## China             0.0100402
## United States     0.0090142
##        UA 
## "Ukraine" 
## 
## 
## Country          Similarity
## --------------  -----------
## Russia            0.2094313
## United States     0.0249699
## France            0.0238095
## Germany           0.0213439
## Syria             0.0188434
## Great Britain     0.0183619
## Israel            0.0178571
## Malaysia          0.0154506
## Iraq              0.0148196
## Nigeria           0.0144597
##       RU 
## "Russia" 
## 
## 
## Country          Similarity
## --------------  -----------
## Ukraine           0.2094313
## France            0.0305615
## Canada            0.0301407
## Syria             0.0286914
## United States     0.0285581
## Germany           0.0275387
## China             0.0265549
## Great Britain     0.0241321
## Iran              0.0240683
## Turkey            0.0179462
##              GB 
## "Great Britain" 
## 
## 
## Country          Similarity
## --------------  -----------
## United States     0.0966120
## Australia         0.0576638
## Canada            0.0443807
## India             0.0436085
## France            0.0363921
## Syria             0.0311878
## Nigeria           0.0257183
## Russia            0.0241321
## Pakistan          0.0238313
## China             0.0230465
##              US 
## "United States" 
## 
## 
## Country          Similarity
## --------------  -----------
## Great Britain     0.0966120
## Canada            0.0370900
## India             0.0345041
## Australia         0.0319711
## Iraq              0.0288215
## Russia            0.0285581
## Iran              0.0271781
## China             0.0256216
## Syria             0.0252300
## Ukraine           0.0249699
##     IQ 
## "Iraq" 
## 
## 
## Country          Similarity
## --------------  -----------
## Syria             0.0832639
## Iran              0.0511883
## United States     0.0288215
## Libya             0.0283447
## Turkey            0.0228091
## Great Britain     0.0205464
## Israel            0.0201511
## Afghanistan       0.0198330
## Pakistan          0.0193370
## Nigeria           0.0152381
##      SY 
## "Syria" 
## 
## 
## Country          Similarity
## --------------  -----------
## Iraq              0.0832639
## Turkey            0.0536160
## Iran              0.0428832
## Great Britain     0.0311878
## Israel            0.0307167
## Russia            0.0286914
## United States     0.0252300
## Libya             0.0250569
## France            0.0208333
## Ukraine           0.0188434

Interest (relative to max)

The interest of a country in an event is the number of tweets it published about that event divided by the maximum number of tweets it has published about any single event.

# Keep only (event, country) rows with at least one tweet.
part.filtered <- part.tidy[tweets > 0]
setkey(part.filtered, component_id)

# Per country, divide the tweets of each event by that country's largest
# per-event tweet count. The column is named directly in j, which avoids
# dividing every column of .SD and renaming the auto-generated "V3" afterwards.
part.frac <- part.filtered[,
  list(component_id, tweets, interest = as.numeric(tweets / max(tweets))),
  by = country
]

Most interest is concentrated below \(0.25\).

# Histogram of interest values (bin width 0.01); shown, then saved as EPS and PDF.
p <- ggplot(part.frac, aes(x = interest)) +
  geom_histogram(binwidth = 0.01) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Interest") +
  ylab("Frequency") +
  scale_y_continuous(labels = comma)

print(p)

for (out_name in c("interest-hist.eps", "interest-hist.pdf")) {
  ggsave(paste0(PDF_PATH, out_name), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

These countries have more than one event with interest equal to 1.0, possibly meaning that they have few tweets:

# Countries with two or more rows at interest 1. For the other countries
# which() yields an empty result, so they do not appear in the output.
part.few <- part.frac[interest == 1, which(.N >= 2L), by = country]
print(part.few)

##     country V1
##  1:      DZ  1
##  2:      DM  1
##  3:      CV  1
##  4:      LC  1
##  5:      PW  1
##  6:      TJ  1
##  7:      SH  1
##  8:      SM  1
##  9:      AI  1
## 10:      BV  1
## 11:      CC  1
## 12:      TK  1
## 13:      KY  1
## 14:      HM  1
## 15:      CX  1

# All retained participation rows that belong to the countries listed in part.few.
part.few2 <- part.filtered[country %in% part.few[["country"]]]
print(part.few2)

##       component_id country tweets
##    1:            4      DZ      1
##    2:            7      DZ      1
##    3:           11      DM      1
##    4:           12      CV      1
##    5:           17      DM      1
##   ---                            
## 7751:        25473      DZ      1
## 7752:        25474      DZ      8
## 7753:        25476      DZ      3
## 7754:        25477      TJ      1
## 7755:        25478      DZ      1

# Median and upper-tail quantiles of the tweet counts of those countries.
quantile(part.few2[["tweets"]], probs = c(0.5, 0.75, 0.9, 0.99, 0.999))

##    50%    75%    90%    99%  99.9% 
##  1.000  3.000  6.000 26.000 61.246

Re-filter the table to get rid of entries with very few tweets (6 or fewer, the 90th percentile computed above):

# Keep only (event, country) rows with more than 6 tweets, then recompute the
# interest relative to each country's largest per-event tweet count.
part.filtered <- part.tidy[tweets > 6]
setkey(part.filtered, component_id)
# The column is named directly in j, which avoids dividing every column of
# .SD and renaming the auto-generated "V3" afterwards.
part.frac <- part.filtered[,
  list(component_id, tweets, interest = as.numeric(tweets / max(tweets))),
  by = country
]
# Re-create the factor so that levels no longer present after filtering are dropped.
part.frac[, country := factor(country)]

# Histogram of the re-filtered interest values (bin width 0.01).
p <- ggplot(part.frac, aes(x = interest)) +
  geom_histogram(binwidth = 0.01) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Interest") +
  ylab("Frequency") +
  scale_y_continuous(labels = comma)

print(p)

for (out_name in c("interest-hist-f.eps", "interest-hist-f.pdf")) {
  ggsave(paste0(PDF_PATH, out_name), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Distribution of interest

# Box plot of the interest values of a hand-picked set of countries.
selected_countries <- c("US", "GB", "UA", "RU", "BR", "CL", "SY", "IQ", "IL", "PS")
p <- ggplot(
  part.frac[country %in% selected_countries],
  aes(x = country, y = interest)
) +
  geom_boxplot() +
  fte_theme() +
  scale_x_discrete(labels = map_country) +
  xlab("") +
  ylab("Interest")

# The title is added only to the on-screen version; the saved files have none.
print(p + ggtitle("Distribution of interest of selected countries"))

for (out_name in c("interest-dist.eps", "interest-dist.pdf")) {
  ggsave(paste0(PDF_PATH, out_name), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Comparison between countries

Re-make the plots of Fig. 6.

Each plot shows the average interest of each country (x-axis) in the events whose protagonist is one of the two selected countries.

# Average interest of every interested country in the events of every
# protagonist country. Rows without a protagonist are filtered out, and
# na.omit() removes any remaining incomplete rows.
part.wrt.all <- part.frac %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  mutate(
    c.interested = country,
    c.protagonist = country_code
  ) %>%
  select(component_id, c.interested, c.protagonist, interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

part.wrt.all <- data.table(na.omit(part.wrt.all))

# Protagonist pairs to compare, one figure per pair.
selected_pairs <- list(
  c("RU", "UA"), c("IL", "PS"), c("BR", "DE"),
  c("GB", "US"), c("BR", "US"), c("DE", "IL"),
  c("BR", "PS"), c("MY", "VN"), c("CN", "MY"),
  c("BO", "GA"), c("PY", "TJ"), c("FO", "SJ"),
  c("MY", "UA"), c("AR", "BR"), c("CS", "TM")
)

for (pair in selected_pairs) {
  pair_labels <- map_country(pair)
  pair_title <- paste(pair_labels, collapse = " & ")
  print(pair_title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = pair_labels
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = pair_labels
    )
    # scale_x_discrete(labels=map_country)

  # The title is added only to the on-screen version.
  print(p + ggtitle(pair_title))

  # File stem is "int-prot-<code1>-<code2>" under PDF_PATH.
  out_stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(out_stem, ext), p, dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}

## [1] "Russia & Ukraine"

## [1] "Israel & Palestine"

## [1] "Brazil & Germany"

## [1] "Great Britain & United States"

## [1] "Brazil & United States"

## [1] "Germany & Israel"

## [1] "Brazil & Palestine"

## [1] "Malaysia & Vietnam"

## [1] "China & Malaysia"

## [1] "Bolivia & Gabon"

## [1] "Paraguay & Tajikistan"

## [1] "Faroe Islands & Svalbard and Jan Mayen"

## [1] "Malaysia & Ukraine"

## [1] "Argentina & Brazil"

## [1] "Serbia & Turkmenistan"

Export vectors to files:

# Write one tab-separated file per protagonist country with its rows of
# part.wrt.all. The country list is derived here from the table being
# exported, as the other export chunks in this file do.
countries <- unique(part.wrt.all$c.protagonist)
# The loop variable is not called `c`, so base::c() is not masked afterwards.
for (prot_code in countries) {
  x <- part.wrt.all[c.protagonist == prot_code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", prot_code, ".txt")

  write.table(x, f, row.names = FALSE, sep = "\t")
}

Similar countries

# Load the country-pair table (three tab-separated columns, no header) and
# give the columns working names.
knn.int.cos <- read.table(
  "~/galean/scripts/queries/data/knn_int_cos.txt",
  header = FALSE, sep = "\t",
  stringsAsFactors = TRUE, na.strings = ""
)
knn.int.cos <- data.table(knn.int.cos)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# Drop rows whose dist.cos is exactly 1, then keep, for every c1, the row(s)
# holding that country's largest remaining dist.cos.
one_nn <- knn.int.cos[dist.cos != 1]
best_rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- one_nn[best_rows]
one_nn <- na.omit(one_nn[order(-dist.cos)])

# Same rows with country codes replaced through map_country().
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)]
one_nn.table <- one_nn.table[order(-dist.cos)]

kable(one_nn.table[1:30]) # , format="latex", booktabs=T)

V1	V2	dist.cos
Solomon Islands	Hungary	0.9999
Solomon Islands	Dominica	0.9999
Solomon Islands	Bulgaria	0.9999
Solomon Islands	Faroe Islands	0.9999
Solomon Islands	Puerto Rico	0.9999
Solomon Islands	Estonia	0.9999
Solomon Islands	Republic of the Congo	0.9999
Solomon Islands	Svalbard and Jan Mayen	0.9999
Solomon Islands	Ivory Coast	0.9999
Solomon Islands	Montenegro	0.9999
Gabon	NA	0.9977
Bolivia	NA	0.9976
Bhutan	Montenegro	0.9975
Bhutan	Maldives	0.9975
Fiji	Syria	0.9945
Gambia	Jordan	0.9917
Togo	Dominica	0.9892
Poland	Afghanistan	0.9869
Vanuatu	Haiti	0.9825
Lesotho	Slovakia	0.9799
Cook Islands	Belize	0.9773
Mongolia	Belize	0.9773
Greenland	NA	0.9769
Cocos [Keeling] Islands	NA	0.9758
Ecuador	Lithuania	0.9752
East Timor	Haiti	0.9750
Burundi	Belgium	0.9745
Somalia	Finland	0.9704
Macedonia	New Zealand	0.9675
Sierra Leone	Nigeria	0.9673

# Grid of country pairs whose dist.cos lies strictly between 0 and 0.5, each
# cell labelled with the value rounded to two decimals.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]
ggplot(knn.int.cos.pl, aes(x = reorder(c1, dist.cos), y = reorder(c2, dist.cos),
                           label = as.character(round(dist.cos, 2)))) +
  # The fill is a constant, so it is set on the layer. Inside aes() the string
  # "white" would be mapped as a one-level variable rather than used as a colour.
  geom_tile(size = .1, color = "white", fill = "white") + fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) + scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")

Events with most interest

# Selected quantiles of the max-relative interest.
quantile(part.frac[["interest"]], probs = c(0.5, 0.75, 0.8, 0.9, 0.99))

##         50%         75%         80%         90%         99% 
## 0.008673027 0.026315789 0.034852547 0.070422535 0.319908521

# Rows whose interest exceeds the overall median interest.
interest_median <- median(part.frac[["interest"]])
part.median_interest <- part.frac[interest > interest_median]

# Number of such rows per event; the count column is named V1.
part.countries_by_event <- part.median_interest[, list(V1 = .N), by = component_id]

Events with maximum number of countries interested (and with interest > median interest):

# Ten events with the highest counts, largest first.
top_events <- part.countries_by_event[order(-V1)][1:10]
kable(top_events, col.names = c("component_id", "no of countries"))

component_id	no of countries
15945	180
14355	160
18989	160
13187	155
2878	154
24460	154
12321	153
298	151
11432	150
14577	150

Even with no filtering (interest > 0), the resulting list is unchanged. That means the events with the most interested countries produced high interest from every country (at least in the top 50%).

Interest (relative to sum)

Measure the interest of a country in an event as the fraction of all tweets issued from that country that belong to that event.

# Keep only (event, country) rows with at least one tweet.
part.filtered <- part.tidy[tweets > 0]
setkey(part.filtered, component_id)

# Per country, divide the tweets of each event by the country's total tweets
# over all retained rows. The column is named directly in j, which avoids
# dividing every column of .SD and renaming the auto-generated "V3" afterwards.
part.frac.all <- part.filtered[,
  list(component_id, tweets, interest = as.numeric(tweets / sum(tweets))),
  by = country
]

Distribution of interest

Countries with largest IQR of interest:

# Interquartile range of interest per country; show the 30 largest.
interest_iqr <- part.frac.all[, list(V1 = IQR(interest)), by = country]
kable(interest_iqr[order(-V1)][1:30])

country	V1
MC	0.0033784
NU	0.0033333
GS	0.0021097
VA	0.0020270
MO	0.0018904
IO	0.0017123
SZ	0.0014286
SR	0.0013280
SL	0.0010718
KG	0.0009416
AG	0.0007955
BN	0.0007794
CR	0.0007452
CG	0.0006998
ER	0.0006840
GW	0.0006566
MP	0.0006532
LS	0.0006184
SV	0.0005797
GU	0.0005376
UZ	0.0005280
VI	0.0005198
GA	0.0004647
LR	0.0004505
LI	0.0004446
MN	0.0004286
FJ	0.0004186
LU	0.0003748
MU	0.0003733
MQ	0.0003546

# Box plot of the sum-relative interest of a hand-picked set of countries.
selected_countries <- c("US", "GB", "UA", "RU", "BR", "CL", "SY", "IQ", "IL", "PS")
p <- ggplot(
  part.frac.all[country %in% selected_countries],
  aes(x = country, y = interest)
) +
  geom_boxplot() +
  fte_theme() +
  scale_x_discrete(labels = map_country) +
  xlab("") +
  ylab("Interest")

# The title is added only to the on-screen version; the saved files have none.
print(p + ggtitle("Distribution of interest of selected countries"))

for (out_name in c("interest-all-dist.eps", "interest-all-dist.pdf")) {
  ggsave(paste0(PDF_PATH, out_name), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Comparison between countries

# Average sum-relative interest of every interested country in the events of
# every protagonist country. Rows without a protagonist are filtered out, and
# na.omit() removes any remaining incomplete rows.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  mutate(
    c.interested = country,
    c.protagonist = country_code
  ) %>%
  select(component_id, c.interested, c.protagonist, interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

part.wrt.all <- data.table(na.omit(part.wrt.all))

# Protagonist pairs to compare, one figure per pair.
selected_pairs <- list(
  c("RU", "UA"), c("IL", "PS"), c("BR", "DE"),
  c("GB", "US"), c("BR", "US"), c("DE", "IL"),
  c("BR", "PS"), c("MY", "VN"), c("CN", "MY"),
  c("BO", "GA"), c("PY", "TJ"), c("FO", "SJ"),
  c("MY", "UA"), c("AR", "BR"), c("CS", "TM")
)

for (pair in selected_pairs) {
  pair_labels <- map_country(pair)
  pair_title <- paste(pair_labels, collapse = " & ")
  print(pair_title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = pair_labels
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = pair_labels
    )
    # scale_x_discrete(labels=map_country)

  # The title is added only to the on-screen version.
  print(p + ggtitle(pair_title))

  # File stem is "int-prot-all-<code1>-<code2>" under PDF_PATH.
  out_stem <- paste0(PDF_PATH, paste(c("int-prot-all", pair), collapse = "-"))
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(out_stem, ext), p, dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}

## [1] "Russia & Ukraine"

## [1] "Israel & Palestine"

## [1] "Brazil & Germany"

## [1] "Great Britain & United States"

## [1] "Brazil & United States"

## [1] "Germany & Israel"

## [1] "Brazil & Palestine"

## [1] "Malaysia & Vietnam"

## [1] "China & Malaysia"

## [1] "Bolivia & Gabon"

## [1] "Paraguay & Tajikistan"

## [1] "Faroe Islands & Svalbard and Jan Mayen"

## [1] "Malaysia & Ukraine"

## [1] "Argentina & Brazil"

## [1] "Serbia & Turkmenistan"

Export vectors to files:

# Write one tab-separated file per protagonist country with its rows of
# part.wrt.all.
countries <- unique(part.wrt.all$c.protagonist)
# The loop variable is not called `c`, so base::c() is not masked afterwards.
for (prot_code in countries) {
  x <- part.wrt.all[c.protagonist == prot_code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", prot_code, ".txt")

  write.table(x, f, row.names = FALSE, sep = "\t")
}

Similar countries

# Load the country-pair table (three tab-separated columns, no header) and
# give the columns working names.
knn.int.cos <- read.table(
  "~/galean/scripts/queries/data/knn_int_cos2.txt",
  header = FALSE, sep = "\t",
  stringsAsFactors = TRUE, na.strings = ""
)
knn.int.cos <- data.table(knn.int.cos)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# Drop rows whose dist.cos is exactly 1, then keep, for every c1, the row(s)
# holding that country's largest remaining dist.cos.
one_nn <- knn.int.cos[dist.cos != 1]
best_rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- one_nn[best_rows]
one_nn <- na.omit(one_nn[order(-dist.cos)])

# Same rows with country codes replaced through map_country().
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)]
one_nn.table <- one_nn.table[order(-dist.cos)]

kable(one_nn.table[1:30]) # , format="latex", booktabs=T)

V1	V2	dist.cos
Cuba	United States	0.9995
United States	Cuba	0.9995
Costa Rica	Uruguay	0.9994
Uruguay	Costa Rica	0.9994
Argentina	Brazil	0.9993
Brazil	Argentina	0.9993
Bosnia and Herzegovina	Honduras	0.9992
Honduras	Bosnia and Herzegovina	0.9992
Honduras	Croatia	0.9992
Croatia	Costa Rica	0.9992
Croatia	Honduras	0.9992
Denmark	Russia	0.9991
Spain	Argentina	0.9991
Iraq	North Korea	0.9991
North Korea	Iraq	0.9991
Russia	Denmark	0.9991
Libya	Cuba	0.9990
Mexico	Peru	0.9990
Peru	Mexico	0.9990
Taiwan	Denmark	0.9989
Vietnam	Denmark	0.9989
Botswana	United States	0.9988
Israel	Iraq	0.9988
Aruba	Libya	0.9987
France	Italy	0.9987
Guyana	Papua	0.9987
Italy	France	0.9987
Papua	Guyana	0.9987
Democratic Republic of the Congo	Colombia	0.9986
Colombia	Democratic Republic of the Congo	0.9986

# Heat map of the first 30 rows of one_nn: tiles coloured by dist.cos and
# labelled with the value rounded to two decimals.
ggplot(
  one_nn[1:30],
  aes(
    x = reorder(c1, -dist.cos),
    y = reorder(c2, -dist.cos),
    fill = dist.cos,
    label = as.character(round(dist.cos, 2))
  )
) +
  geom_tile(size = .1) +
  fte_theme() +
  geom_text(size = 2, color = palette[1]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 7)
  ) +
  labs(x = "Country", y = "Country")

# Rebuild the average sum-relative interest per (interested, protagonist)
# country pair. Rows without a protagonist are filtered out, and na.omit()
# removes any remaining incomplete rows.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  mutate(
    c.interested = country,
    c.protagonist = country_code
  ) %>%
  select(component_id, c.interested, c.protagonist, interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

part.wrt.all <- data.table(na.omit(part.wrt.all))

# Second set of protagonist pairs to compare, one figure per pair.
selected_pairs <- list(
  c("FO", "SJ"), c("MY", "UA"), c("FJ", "VU"),
  c("RS", "TM"), c("CN", "DE"), c("PY", "TJ"),
  c("NL", "MY"), c("BO", "TL"), c("GA", "TL"),
  c("AU", "DE"), c("PL", "TR"), c("IN", "AU"),
  c("AT", "IQ"), c("KE", "YE"), c("AM", "VA")
)

for (pair in selected_pairs) {
  pair_labels <- map_country(pair)
  pair_title <- paste(pair_labels, collapse = " & ")
  print(pair_title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = pair_labels
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = pair_labels
    )
    # scale_x_discrete(labels=map_country)

  # The title is added only to the on-screen version.
  print(p + ggtitle(pair_title))

  # File stem is "int-prot-all-<code1>-<code2>" under PDF_PATH.
  out_stem <- paste0(PDF_PATH, paste(c("int-prot-all", pair), collapse = "-"))
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(out_stem, ext), p, dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}

## [1] "Faroe Islands & Svalbard and Jan Mayen"

## [1] "Malaysia & Ukraine"

## [1] "Fiji & Vanuatu"

## [1] "Serbia & Turkmenistan"

## [1] "China & Germany"

## [1] "Paraguay & Tajikistan"

## [1] "Netherlands & Malaysia"

## [1] "Bolivia & East Timor"

## [1] "Gabon & East Timor"

## [1] "Australia & Germany"

## [1] "Poland & Turkey"

## [1] "India & Australia"

## [1] "Austria & Iraq"

## [1] "Kenya & Yemen"

## [1] "Armenia & Vatican City"

Events with most interest

# Five-number summary (plus mean) of the sum-relative interest.
summary(part.frac.all[["interest"]])

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000001 0.0000181 0.0000542 0.0002377 0.0001670 0.5000000

# Selected quantiles of the sum-relative interest.
quantile(part.frac.all[["interest"]], probs = c(0.5, 0.75, 0.8, 0.9, 0.99))

##          50%          75%          80%          90%          99% 
## 5.424464e-05 1.669728e-04 2.135535e-04 4.446421e-04 2.701790e-03

# Keep rows whose interest exceeds the 99th percentile. The threshold is
# computed from the data instead of being pasted in as a rounded constant, so
# it stays correct if the input changes. The variable keeps its historical
# name even though the cut-off is not the median.
interest_p99 <- quantile(part.frac.all[["interest"]], probs = 0.99, names = FALSE)
part.median_interest <- part.frac.all[interest > interest_p99]

# Number of such rows per event; the count column is named V1.
part.countries_by_event <- part.median_interest[, list(V1 = .N), by = component_id]

Events with the maximum number of countries interested (counting only interest above the 99th percentile, 2.701790e-03):

# Ten events with the highest counts, largest first.
top_events <- part.countries_by_event[order(-V1)][1:10]
kable(top_events, col.names = c("component_id", "no of countries"))

component_id	no of countries
15945	202
298	153
14355	144
12321	143
18989	130
2878	127
13187	106
17406	102
19427	97
24460	97

# Hand-written table of high-interest events. The three vectors are parallel:
# position i of each one describes the same event.
events <- c(
  "Death of actor Robin Williams.",
  "2014 FIFA World Cup final between Germany and Argentina.",
  "2014 FIFA World Cup starts.",
  "2015 Super Bowl starts.",
  "New Year's Eve 2013",
  "Soccer Player Luis Suarez is suspended from 2014 World Cup.",
  "Charlie Hebdo shooting in Paris.",
  "2015 Grammy Awards.",
  "Professional boxing match between Floyd Mayweather and Manny Pacquiao."
)
dates <- c(
  "2014-08-12", "2014-07-13", "2014-06-12",
  "2015-02-02", "2013-12-31", "2014-06-26",
  "2015-01-07", "2015-02-09", "2015-05-03"
)
countries_affected <- c(202, 144, 143, 130, 127, 106, 102, 97, 97)

hi_events <- data.table(
  Description = events,
  Date = dates,
  Countries = countries_affected
)

# Emit the table as LaTeX (booktabs rules).
kable(hi_events, format = "latex", booktabs = TRUE)

\begin{tabular}{llr} \toprule Description & Date & Countries\\ \midrule Death of actor Robin Williams. & 2014-08-12 & 202\\ 2014 FIFA World Cup final between Germany and Argentina. & 2014-07-13 & 144\\ 2014 FIFA World Cup starts. & 2014-06-12 & 143\\ 2015 Super Bowl starts. & 2015-02-02 & 130\\ New Year's Eve 2013 & 2013-12-31 & 127\\ \addlinespace Soccer Player Luis Suarez is suspended from 2014 World Cup. & 2014-06-26 & 106\\ Charlie Hebdo shooting in Paris. & 2015-01-07 & 102\\ 2015 Grammy Awards. & 2015-02-09 & 97\\ Professional boxing match between Floyd Mayweather and Manny Pacquiao. & 2015-05-03 & 97\\ \bottomrule \end{tabular}

Self-interest

# Per-event interest rows labelled with interested and protagonist country
# (no averaging here). Rows without a protagonist and incomplete rows are dropped.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  mutate(
    c.interested = country,
    c.protagonist = country_code
  ) %>%
  select(component_id, c.interested, c.protagonist, interest)

part.wrt.all <- na.omit(data.table(part.wrt.all))

# Self-interest: for every listed country code, the summed interest over rows
# where the country is both the protagonist and the interested party.
cs <- as.character(countries_list$country.code)
si <- vapply(
  cs,
  function(cnt) {
    part.wrt.all[c.protagonist == cnt & c.interested == cnt, sum(interest)]
  },
  numeric(1),
  USE.NAMES = FALSE
)

part.self <- data.table(c.interested = cs, self.interest = si)
# part.self <- part.self[order(-self.interest)][, list(country=map_country(country),
#                                                      self.interest)]
# Attach the row count N of each interested country, then sort by
# self-interest, largest first.
rows_per_country <- part.wrt.all[, .N, by = c.interested]
part.self <- rows_per_country %>% left_join(part.self, "c.interested")
part.self <- part.self[order(-self.interest)]

# Horizontal bar chart of the 15 countries with the highest self-interest;
# each bar is annotated with the country's row count N.
p <- ggplot(
  part.self[1:15],
  aes(
    x = reorder(c.interested, self.interest),
    y = self.interest,
    label = comma(N)
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.2, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 1, by = 0.1), limits = c(0, 1)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") +
  ylab("Interest")

# The title is added only to the on-screen version; the saved files have none.
print(p + ggtitle("Interest of countries in events protagonized by them"))

for (out_name in c("self-interest.eps", "self-interest.pdf")) {
  ggsave(paste0(PDF_PATH, out_name), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Interest (relative to mean)

# Count, for every (interested, protagonist) country pair, the joined rows in
# which the interested country tweeted at least once.
part.tidy2 <- part.tidy[tweets > 0]
part.wrt.all <- part.tidy2 %>%
  full_join(prot, by = "component_id") %>%
  mutate(
    c.interested = country,
    c.protagonist = country_code
  ) %>%
  select(component_id, c.protagonist, c.interested, tweets) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(events = n())

# Drop pairs with a missing country on either side, then show the table.
part.wrt.all <- data.table(na.omit(part.wrt.all))
print(part.wrt.all)

##        c.interested c.protagonist events
##     1:           AD            AE      4
##     2:           AD            AF      5
##     3:           AD            AR      4
##     4:           AD            AU     22
##     5:           AD            BB      1
##    ---                                  
## 31679:           ZW            XK      1
## 31680:           ZW            YE     68
## 31681:           ZW            ZA    124
## 31682:           ZW            ZM      1
## 31683:           ZW            ZW     27

# part.wrt.all <- part.wrt.all[, .SD[sum(events) > 50], 
                             # by=c.interested][, .SD[sum(events) > 50], 
                                              # by=c.protagonist]

# Standardise the event counts within each interested country (centre on the
# group mean, divide by the group standard deviation). scale() returns a
# one-column matrix carrying "scaled:*" attributes, so it is flattened to a
# plain numeric vector before being stored as a column.
part.wrt.all[, sc.events := as.vector(scale(events)), by = c.interested]
# tmp2[, sc.events:=scale(events), by=c.interested]

# Protagonist pairs to compare on the standardised event counts.
selected_pairs <- list(
  c("UA", "RU"), c("IL", "PS"), c("BR", "DE"),
  c("US", "GB")
)

for (pair in selected_pairs) {
  pair_labels <- map_country(pair)
  pair_title <- paste(pair_labels, collapse = " & ")
  print(pair_title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, sc.events),
      y = sc.events,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist)) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = pair_labels
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = pair_labels
    )
    # scale_x_discrete(labels=map_country)

  # The title is added only to the on-screen version.
  print(p + ggtitle(pair_title))

  # NOTE(review): the "int-prot" prefix is also used by the max-relative
  # section of this file, so pairs with the same code order (e.g. IL-PS,
  # BR-DE) end up in the same file names. Confirm the overwrite is intended.
  out_stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(out_stem, ext), p, dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}

## [1] "Ukraine & Russia"

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "Israel & Palestine"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Brazil & Germany"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "United States & Great Britain"

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

Export interest vectors

# Write one tab-separated file per protagonist country with its rows of
# part.wrt.all.
countries <- unique(part.wrt.all$c.protagonist)
# The loop variable is not called `c`, so base::c() is not masked afterwards.
for (prot_code in countries) {
  x <- part.wrt.all[c.protagonist == prot_code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", prot_code, ".txt")

  write.table(x, f, row.names = FALSE, sep = "\t")
}

Cosine similarity

# Load the country-pair table (three tab-separated columns, no header) and
# give the columns working names.
knn.int.cos <- read.table(
  "~/galean/scripts/queries/data/knn_int_cos_scale.txt",
  header = FALSE, sep = "\t",
  stringsAsFactors = TRUE, na.strings = ""
)
knn.int.cos <- data.table(knn.int.cos)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# Drop rows whose dist.cos is exactly 1, then keep, for every c1, the row(s)
# holding that country's largest remaining dist.cos.
one_nn <- knn.int.cos[dist.cos != 1]
best_rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- one_nn[best_rows]
one_nn <- na.omit(one_nn[order(-dist.cos)])

# Same rows with country codes replaced through map_country().
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)]
one_nn.table <- one_nn.table[order(-dist.cos)]

kable(one_nn.table[1:30]) # , format="latex", booktabs=T)

V1	V2	dist.cos
Australia	Great Britain	0.9828
Great Britain	Australia	0.9828
United States	Great Britain	0.9803
Algeria	Sri Lanka	0.9766
Sri Lanka	Algeria	0.9766
Russia	Ukraine	0.9728
Ukraine	Russia	0.9728
Canada	United States	0.9721
Bosnia and Herzegovina	Honduras	0.9714
Honduras	Bosnia and Herzegovina	0.9714
Croatia	Bosnia and Herzegovina	0.9713
France	Great Britain	0.9708
Taiwan	Bosnia and Herzegovina	0.9705
Niger	Bosnia and Herzegovina	0.9686
Finland	Croatia	0.9684
Costa Rica	Sweden	0.9671
Sweden	Costa Rica	0.9671
Estonia	Kazakhstan	0.9667
Kazakhstan	Estonia	0.9667
Denmark	Croatia	0.9665
Uruguay	Taiwan	0.9660
Zimbabwe	Croatia	0.9650
Ivory Coast	Croatia	0.9638
Ghana	Taiwan	0.9623
China	Australia	0.9608
Singapore	Taiwan	0.9606
Chile	Taiwan	0.9602
Cameroon	Niger	0.9564
Norway	Croatia	0.9563
Guinea	Mali	0.9561

# dist.cos between a hand-picked set of countries, excluding values of exactly
# 1, largest first. The two code sets differ: "IL" is only matched on the c1
# side and "PS" only on the c2 side.
c1_codes <- c("BR", "DE", "UA", "RU", "IL")
c2_codes <- c("BR", "DE", "UA", "RU", "PS")
picked_pairs <- knn.int.cos[c1 %in% c1_codes & c2 %in% c2_codes]
picked_pairs[dist.cos != 1][order(-dist.cos)]

##     c1 c2 dist.cos
##  1: RU UA   0.9728
##  2: UA RU   0.9728
##  3: DE RU   0.9256
##  4: RU DE   0.9256
##  5: IL RU   0.8964
##  6: IL UA   0.8879
##  7: DE UA   0.8733
##  8: UA DE   0.8733
##  9: IL PS   0.8650
## 10: IL DE   0.8543
## 11: BR DE   0.7663
## 12: DE BR   0.7663
## 13: UA PS   0.7189
## 14: RU PS   0.6948
## 15: DE PS   0.6401
## 16: BR RU   0.6239
## 17: RU BR   0.6239
## 18: IL BR   0.5985
## 19: BR UA   0.5605
## 20: UA BR   0.5605
## 21: BR PS   0.3527
##     c1 c2 dist.cos

# Grid of country pairs whose dist.cos lies strictly between 0.95 and 1, each
# cell labelled with the value rounded to two decimals.
knn.int.cos.pl <- knn.int.cos[order(-dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos > .95 & dist.cos < 1]
ggplot(knn.int.cos.pl, aes(x = reorder(c1, -dist.cos), y = reorder(c2, -dist.cos),
                           label = as.character(round(dist.cos, 2)))) +
  # The fill is a constant, so it is set on the layer. Inside aes() the string
  # "white" would be mapped as a one-level variable rather than used as a colour.
  geom_tile(size = .1, color = "white", fill = "white") + fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) + scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")

# Country pairs to compare; each of them appears in the cosine-similarity
# ranking printed above.
selected_pairs <- list(c("AU", "GB"), c("US", "GB"), c("DZ", "LK"),
                       c("RU", "UA"), c("CA", "US"))

# For every pair: plot sc.events of both protagonist countries across the
# interested countries, print the titled plot, and save untitled EPS and PDF
# copies under PDF_PATH as "int-prot-<c1>-<c2>.<ext>".
for (pair in selected_pairs) {
  pair.label <- paste(map_country(pair), collapse = " & ")
  print(pair.label)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist), size = .5) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Interest") +
    # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    # Colour and shape share name/breaks/labels so they merge into one legend.
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = map_country(pair)) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = map_country(pair)) +
    # scale_x_discrete(labels=map_country)
    fte_theme() + theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1), legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6))

  # The title is only added to the on-screen copy, not to the saved files.
  print(p + ggtitle(pair.label))

  # NOTE(review): the rendered output shows "semi-transparency is not
  # supported on this device" once per pair, presumably from the .eps save of
  # the geom_smooth band -- confirm the EPS files look as intended.
  out.stem <- paste(c("int-prot", pair), collapse = "-")
  ggsave(paste(c(PDF_PATH, out.stem, ".eps"), collapse = ""),
         p, dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste(c(PDF_PATH, out.stem, ".pdf"), collapse = ""),
         p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

## [1] "Australia & Great Britain"

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "United States & Great Britain"

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## [1] "Algeria & Sri Lanka"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Russia & Ukraine"

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "Canada & United States"

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

Euclidean distance

# Load pairwise distances between countries (tab-separated, no header row) and
# name the three columns c1, c2, dist.euc. na.strings = '' means only empty
# fields become NA, so a literal "NA" field is kept as text.
knn.int.euc <- data.table(read.table('~/galean/scripts/queries/data/knn_int_euc_scale.txt', 
                                     header=F, sep='\t', 
                                     stringsAsFactors = T, na.strings = ''))
setnames(knn.int.euc, c("V1", "V2", "V3"), c("c1", "c2", "dist.euc"))

# Nearest neighbour per country: drop zero-distance rows (presumably a country
# paired with itself -- TODO confirm), then for each c1 keep the row(s) with
# the smallest distance (ties are all kept), sorted ascending with NA rows
# removed.
one_nn <- knn.int.euc[dist.euc != 0]
one_nn <- one_nn[one_nn[, .I[dist.euc == min(dist.euc)], by=c1]$V1]
one_nn <- na.omit(one_nn[order(dist.euc)])

# Join prot.by_country twice, once keyed on c1 and once on c2, by temporarily
# renaming the relevant column to "country_code". setnames() renames by
# reference, so one_nn itself keeps the column name "country_code" afterwards.
# The N.x / N.y columns used below are presumably the suffixed copies of a
# column N in prot.by_country (N.x for c1, N.y for c2) -- verify against its
# definition.
setnames(one_nn, c("c1"), c("country_code"))
tmp <- one_nn %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code", "c2"), c("c1", "country_code"))
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("c2"))

# Replace country codes with display names and keep only the columns to print.
tmp <- tmp[, list(c1 = map_country(c1), c2 = map_country(c2), N.x, N.y, dist.euc)]
# This ordering is stored in tmp, but the table printed below is re-sorted by
# dist.euc.
tmp <- tmp[order(dist.euc / (N.x * N.y))]

# Print only pairs where both N.x and N.y reach the cutoff.
# NOTE(review): the origin of the 166.41 cutoff is not shown here -- confirm.
kable(tmp[N.x >= 166.41 & N.y >= 166.41][order(dist.euc)])#, format="latex", booktabs=T)

c1	c2	N.x	N.y	dist.euc
Turkey	Indonesia	198	172	1.1442
Yemen	Turkey	202	198	1.3416
Afghanistan	Turkey	323	198	1.5304
Libya	Turkey	253	198	1.6050
Palestine	Egypt	360	316	1.6496
Egypt	Palestine	316	360	1.6496
Malaysia	Turkey	262	198	1.8096
Japan	Spain	354	258	1.8327
Spain	Japan	258	354	1.8327
Italy	Japan	315	354	1.9018
Brazil	Spain	236	258	1.9060
Pakistan	Germany	453	371	2.0674
Germany	Pakistan	371	453	2.0674
Syria	Israel	647	561	2.4463
Israel	Syria	561	647	2.4463
Ukraine	Russia	921	823	2.5557
Russia	Ukraine	823	921	2.5557
Nigeria	Pakistan	412	453	2.5822
China	Canada	646	715	2.6025
Canada	China	715	646	2.6025
Iran	Syria	496	647	2.6838
Iraq	Iran	654	496	2.9270
France	Canada	627	715	3.7859
Australia	France	974	627	4.1398
India	Australia	1561	974	4.8339
Great Britain	India	4015	1561	41.7719
United States	Great Britain	10162	4015	97.2733

# one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.euc)]
# one_nn.table <- one_nn.table[order(dist.euc)]
# 
# kable(one_nn.table[1:30]) #, format="latex", booktabs=T)

# Euclidean distances between a hand-picked set of countries, closest first.
# Rows whose distance is exactly 0 are left out.
knn.int.euc[
  c1 %in% c("BR", "DE", "UA", "RU", "IL") &
    c2 %in% c("BR", "DE", "UA", "RU", "PS") &
    dist.euc != 0
][order(dist.euc)]

##     c1 c2 dist.euc
##  1: RU UA   2.5557
##  2: UA RU   2.5557
##  3: BR DE   2.8704
##  4: DE BR   2.8704
##  5: BR PS   3.2902
##  6: DE PS   3.3507
##  7: IL DE   4.1893
##  8: IL RU   4.2992
##  9: IL PS   4.8173
## 10: IL UA   5.2256
## 11: DE RU   5.5783
## 12: RU DE   5.5783
## 13: IL BR   6.0385
## 14: DE UA   6.9912
## 15: UA DE   6.9912
## 16: RU PS   7.5344
## 17: BR RU   7.9699
## 18: RU BR   7.9699
## 19: UA PS   8.5056
## 20: BR UA   9.2345
## 21: UA BR   9.2345
##     c1 c2 dist.euc

# Grid of the least similar country pairs under cosine similarity: keep pairs
# with 0 < similarity < 0.5, least similar first.
# NOTE(review): this chunk sits under the "Euclidean distance" heading but
# still reads knn.int.cos / dist.cos -- confirm whether knn.int.euc was meant.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]
# `fill` is a constant, so it is set on the tile layer instead of being mapped
# inside aes(). Inside aes() the string "white" is treated as a one-level
# discrete variable: the tiles get the default palette colour (not white) and
# a fill legend is generated for it.
ggplot(knn.int.cos.pl, aes(x=reorder(c1, dist.cos), y=reorder(c2, dist.cos),
                           label=as.character(round(dist.cos, 2)))) +
  geom_tile(size=.1, color="white", fill="white") + fte_theme() +
  geom_text(size=2, color=palette[9]) +
  # Axis ticks are country codes; map_country turns them into display names.
  scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x=element_text(size=7, angle=45, hjust=1),
        axis.text.y=element_text(size=7)) +
  labs(x="Country", y="Country")

# Country pairs to compare in the Euclidean-distance section.
selected_pairs <- list(c("AF", "TR"), c("PK", "IR"), c("EG", "AF"),
                       c("JP", "IR"), c("MY", "PK"), c("BR", "DE"),
                       c("US", "GB"), c("IL", "PS"), c("UA", "RU"),
                       c("KP", "KR"), c("ID", "TR"), c("YE", "TR"))

# For every pair: plot sc.events of both protagonist countries across the
# interested countries on a fixed y range, print the titled plot, and save
# untitled EPS and PDF copies under PDF_PATH as "int-prot-<c1>-<c2>.<ext>".
for (pair in selected_pairs) {
  pair.label <- paste(map_country(pair), collapse = " & ")
  print(pair.label)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist), size = .5) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Standard deviations from the mean") +
    # Fixed y range shared by all pairs; values outside [-0.5, 2] are dropped,
    # which is presumably what the "Removed N rows" warnings report.
    scale_y_continuous(breaks = seq(-.5, 2, by = 0.25), limits = c(-.5, 2)) +
    # Colour and shape share name/breaks/labels so they merge into one legend.
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = map_country(pair)) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = map_country(pair)) +
    # scale_x_discrete(labels=map_country)
    fte_theme() + theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1), legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6))

  # The title is only added to the on-screen copy, not to the saved files.
  print(p + ggtitle(pair.label))

  out.stem <- paste(c("int-prot", pair), collapse = "-")
  ggsave(paste(c(PDF_PATH, out.stem, ".eps"), collapse = ""),
         p, dpi = pl.DPI, width = pl.W, height = pl.H)
  # NOTE(review): the PDF is saved 2 units narrower than the EPS, unlike the
  # other sections -- confirm this is intentional.
  ggsave(paste(c(PDF_PATH, out.stem, ".pdf"), collapse = ""),
         p, dpi = pl.DPI, width = pl.W - 2, height = pl.H)
}

## [1] "Afghanistan & Turkey"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Pakistan & Iran"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Egypt & Afghanistan"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Japan & Iran"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Malaysia & Pakistan"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Brazil & Germany"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "United States & Great Britain"

## Warning: Removed 446 rows containing non-finite values (stat_smooth).

## Warning: Removed 447 rows containing missing values (geom_point).

## Warning: Removed 446 rows containing non-finite values (stat_smooth).

## Warning: Removed 446 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 446 rows containing non-finite values (stat_smooth).

## Warning: Removed 446 rows containing missing values (geom_point).

## [1] "Israel & Palestine"

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## [1] "Ukraine & Russia"

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 5 rows containing missing values (geom_point).

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 5 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 5 rows containing missing values (geom_point).

## [1] "North Korea & South Korea"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Indonesia & Turkey"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Yemen & Turkey"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

Norm 1 (absolute difference)

# Load pairwise distances between countries (tab-separated, no header row) and
# name the three columns c1, c2, dist.abs. na.strings = '' means only empty
# fields become NA, so a literal "NA" field is kept as text.
knn.int.abs <- data.table(read.table('~/galean/scripts/queries/data/knn_int_abs_scale.txt', 
                                     header=F, sep='\t', 
                                     stringsAsFactors = T, na.strings = ''))
setnames(knn.int.abs, c("V1", "V2", "V3"), c("c1", "c2", "dist.abs"))

# Nearest neighbour per country: drop zero-distance rows (presumably a country
# paired with itself -- TODO confirm), then for each c1 keep the row(s) with
# the smallest distance (ties are all kept), sorted ascending with NA rows
# removed.
one_nn <- knn.int.abs[dist.abs != 0]
one_nn <- one_nn[one_nn[, .I[dist.abs == min(dist.abs)], by=c1]$V1]
one_nn <- na.omit(one_nn[order(dist.abs)])

# Join prot.by_country twice, once keyed on c1 and once on c2, by temporarily
# renaming the relevant column to "country_code". setnames() renames by
# reference, so one_nn itself keeps the column name "country_code" afterwards.
# The N.x / N.y columns used below are presumably the suffixed copies of a
# column N in prot.by_country (N.x for c1, N.y for c2) -- verify against its
# definition.
setnames(one_nn, c("c1"), c("country_code"))
tmp <- one_nn %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code", "c2"), c("c1", "country_code"))
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("c2"))

# Replace country codes with display names and keep only the columns to print.
tmp <- tmp[, list(c1 = map_country(c1), c2 = map_country(c2), N.x, N.y, dist.abs)]
# This ordering is stored in tmp, but the table printed below is re-sorted by
# dist.abs.
tmp <- tmp[order(dist.abs / (N.x * N.y))]

# Print only pairs whose combined N.x + N.y is at least 500.
# NOTE(review): the Euclidean section filters each side separately
# (N.x >= 166.41 & N.y >= 166.41) -- confirm the different rule is intended.
kable(tmp[N.x + N.y >= 500][order(dist.abs)])

c1	c2	N.x	N.y	dist.abs
Libya	Afghanistan	253	323	12.8633
Afghanistan	Libya	323	253	12.8633
Palestine	Egypt	360	316	14.5588
Egypt	Palestine	316	360	14.5588
Japan	Italy	354	315	14.8907
Italy	Japan	315	354	14.8907
Malaysia	Afghanistan	262	323	16.0362
Spain	Italy	258	315	16.3063
Pakistan	Germany	453	371	19.6107
Germany	Pakistan	371	453	19.6107
Syria	Israel	647	561	21.4050
Israel	Syria	561	647	21.4050
Iran	Israel	496	561	22.6966
Iraq	Syria	654	647	23.1034
Ukraine	Russia	921	823	23.4717
Russia	Ukraine	823	921	23.4717
Nigeria	Pakistan	412	453	23.7334
China	Canada	646	715	25.6509
Canada	China	715	646	25.6509
France	Canada	627	715	38.3235
India	Australia	1561	974	41.0575
Australia	India	974	1561	41.0575
Great Britain	India	4015	1561	553.0469
United States	Great Britain	10162	4015	1314.0598

# knn.int.abs[c1 %in% c("BR", "DE", "UA", "RU", "IL") & c2 %in% c("BR", "DE", "UA", "RU", "PS")][dist.abs != 0][order(dist.abs)]

# Grid of the least similar country pairs under cosine similarity: keep pairs
# with 0 < similarity < 0.5, least similar first.
# NOTE(review): this chunk sits under the "Norm 1 (absolute difference)"
# heading but still reads knn.int.cos / dist.cos -- confirm whether
# knn.int.abs was meant.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]
# `fill` is a constant, so it is set on the tile layer instead of being mapped
# inside aes(). Inside aes() the string "white" is treated as a one-level
# discrete variable: the tiles get the default palette colour (not white) and
# a fill legend is generated for it.
ggplot(knn.int.cos.pl, aes(x=reorder(c1, dist.cos), y=reorder(c2, dist.cos),
                           label=as.character(round(dist.cos, 2)))) +
  geom_tile(size=.1, color="white", fill="white") + fte_theme() +
  geom_text(size=2, color=palette[9]) +
  # Axis ticks are country codes; map_country turns them into display names.
  scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x=element_text(size=7, angle=45, hjust=1),
        axis.text.y=element_text(size=7)) +
  labs(x="Country", y="Country")

# Country pairs to compare in the Norm-1 section.
selected_pairs <- list(c("IT", "ES"), c("NG", "IT"), c("EG", "AF"),
                       c("JP", "IR"), c("MY", "PK"))

# For every pair: plot sc.events of both protagonist countries across the
# interested countries, print the titled plot, and save untitled EPS and PDF
# copies under PDF_PATH as "int-prot-<c1>-<c2>.<ext>".
for (pair in selected_pairs) {
  pair.label <- paste(map_country(pair), collapse = " & ")
  print(pair.label)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # Unlike the other sections, the smoother keeps its default line size here.
    geom_smooth(aes(group = c.protagonist)) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Interest") +
    # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    # Colour and shape share name/breaks/labels so they merge into one legend.
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = map_country(pair)) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = map_country(pair)) +
    # scale_x_discrete(labels=map_country)
    fte_theme() + theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1), legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6))

  # The title is only added to the on-screen copy, not to the saved files.
  print(p + ggtitle(pair.label))

  out.stem <- paste(c("int-prot", pair), collapse = "-")
  ggsave(paste(c(PDF_PATH, out.stem, ".eps"), collapse = ""),
         p, dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste(c(PDF_PATH, out.stem, ".pdf"), collapse = ""),
         p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

## [1] "Italy & Spain"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Nigeria & Italy"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Egypt & Afghanistan"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Japan & Iran"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Malaysia & Pakistan"

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

List of country codes

# Print the full country code -> name lookup, sorted by code.
# NOTE(review): Namibia is printed after "ZW" in the rendered table, which
# suggests its code "NA" was parsed as a missing value when countries.txt was
# read (that read.table call sets no na.strings) -- confirm.
countries_list[order(country.code)] %>% kable()

country.code	country.name
AD	Andorra
AE	United Arab Emirates
AF	Afghanistan
AG	Antigua and Barbuda
AI	Anguilla
AL	Albania
AM	Armenia
AN	Netherlands Antilles
AO	Angola
AQ	Antarctica
AR	Argentina
AS	American Samoa
AT	Austria
AU	Australia
AW	Aruba
AX	Åland
AZ	Azerbaijan
BA	Bosnia and Herzegovina
BB	Barbados
BD	Bangladesh
BE	Belgium
BF	Burkina Faso
BG	Bulgaria
BH	Bahrain
BI	Burundi
BJ	Benin
BL	Saint Barthélemy
BM	Bermuda
BN	Brunei
BO	Bolivia
BQ	Bonaire
BR	Brazil
BS	Bahamas
BT	Bhutan
BV	Bouvet Island
BW	Botswana
BY	Belarus
BZ	Belize
CA	Canada
CC	Cocos [Keeling] Islands
CD	Democratic Republic of the Congo
CF	Central African Republic
CG	Republic of the Congo
CH	Switzerland
CI	Ivory Coast
CK	Cook Islands
CL	Chile
CM	Cameroon
CN	China
CO	Colombia
CR	Costa Rica
CS	Serbia
CU	Cuba
CV	Cape Verde
CW	Curacao
CX	Christmas Island
CY	Cyprus
CZ	Czechia
DE	Germany
DJ	Djibouti
DK	Denmark
DM	Dominica
DO	Dominican Republic
DZ	Algeria
EC	Ecuador
EE	Estonia
EG	Egypt
EH	Western Sahara
ER	Eritrea
ES	Spain
ET	Ethiopia
FI	Finland
FJ	Fiji
FK	Falkland Islands
FM	Micronesia
FO	Faroe Islands
FR	France
GA	Gabon
GB	Great Britain
GD	Grenada
GE	Georgia
GF	French Guiana
GG	Guernsey
GH	Ghana
GI	Gibraltar
GL	Greenland
GM	Gambia
GN	Guinea
GP	Guadeloupe
GQ	Equatorial Guinea
GR	Greece
GS	South Georgia and the South Sandwich Islands
GT	Guatemala
GU	Guam
GW	Guinea-Bissau
GY	Guyana
HK	Hong Kong
HM	Heard Island and McDonald Islands
HN	Honduras
HR	Croatia
HT	Haiti
HU	Hungary
ID	Indonesia
IE	Ireland
IL	Israel
IM	Isle of Man
IN	India
IO	British Indian Ocean Territory
IQ	Iraq
IR	Iran
IS	Iceland
IT	Italy
JE	Jersey
JM	Jamaica
JO	Jordan
JP	Japan
KE	Kenya
KG	Kyrgyzstan
KH	Cambodia
KI	Kiribati
KM	Comoros
KN	Saint Kitts and Nevis
KP	North Korea
KR	South Korea
KW	Kuwait
KY	Cayman Islands
KZ	Kazakhstan
LA	Laos
LB	Lebanon
LC	Saint Lucia
LI	Liechtenstein
LK	Sri Lanka
LR	Liberia
LS	Lesotho
LT	Lithuania
LU	Luxembourg
LV	Latvia
LY	Libya
MA	Morocco
MC	Monaco
MD	Moldova
ME	Montenegro
MF	Saint Martin
MG	Madagascar
MH	Marshall Islands
MK	Macedonia
ML	Mali
MM	Myanmar [Burma]
MN	Mongolia
MO	Macao
MP	Northern Mariana Islands
MQ	Martinique
MR	Mauritania
MS	Montserrat
MT	Malta
MU	Mauritius
MV	Maldives
MW	Malawi
MX	Mexico
MY	Malaysia
MZ	Mozambique
NC	New Caledonia
NE	Niger
NF	Norfolk Island
NG	Nigeria
NI	Nicaragua
NL	Netherlands
NO	Norway
NP	Nepal
NR	Nauru
NU	Niue
NZ	New Zealand
OM	Oman
PA	Panama
PE	Peru
PF	French Polynesia
PG	Papua
PH	Philippines
PK	Pakistan
PL	Poland
PM	Saint Pierre and Miquelon
PN	Pitcairn Islands
PR	Puerto Rico
PS	Palestine
PT	Portugal
PW	Palau
PY	Paraguay
QA	Qatar
RE	Réunion
RO	Romania
RS	Serbia
RU	Russia
RW	Rwanda
SA	Saudi Arabia
SB	Solomon Islands
SC	Seychelles
SD	Sudan
SE	Sweden
SG	Singapore
SH	Saint Helena
SI	Slovenia
SJ	Svalbard and Jan Mayen
SK	Slovakia
SL	Sierra Leone
SM	San Marino
SN	Senegal
SO	Somalia
SR	Suriname
SS	South Sudan
ST	São Tomé and Príncipe
SV	El Salvador
SX	Sint Maarten
SY	Syria
SZ	Swaziland
TC	Turks and Caicos Islands
TD	Chad
TF	French Southern Territories
TG	Togo
TH	Thailand
TJ	Tajikistan
TK	Tokelau
TL	East Timor
TM	Turkmenistan
TN	Tunisia
TO	Tonga
TR	Turkey
TT	Trinidad and Tobago
TV	Tuvalu
TW	Taiwan
TZ	Tanzania
UA	Ukraine
UG	Uganda
UM	U.S. Minor Outlying Islands
US	United States
UY	Uruguay
UZ	Uzbekistan
VA	Vatican City
VC	Saint Vincent and the Grenadines
VE	Venezuela
VG	British Virgin Islands
VI	U.S. Virgin Islands
VN	Vietnam
VU	Vanuatu
WF	Wallis and Futuna
YE	Yemen
YT	Mayotte
ZA	South Africa
ZM	Zambia
ZW	Zimbabwe
NA	Namibia

Geopolitical analysis of news events

Mauricio Quezada

Last updated: 2016-04-15

Load data

Summaries & filtering

Measure bias

Events being protagonized

Events a country is interested in

Tweets per interested country

Users per location

Normalization

(Co-)Protagonism (relative)

Co-protagonism of countries

Comparison between countries

Similar countries

(Co-)Protagonism (symmetric)

Interest (relative to max)

Distribution of interest

Comparison between countries

Similar countries

Events with most interest

Interest (relative to sum)

Distribution of interest

Comparison between countries

Similar countries

Events with most interest

Self-interest

Interest (relative to mean)

Export interest vectors

Cosine similarity

Euclidean distance

Norm 1 (absolute difference)

List of country codes