Load data

# message = F, warning = F
library(tidyr)
library(dplyr)
library(ggplot2)
library(data.table)
library(knitr)
library(bit64)
library(extrafont)
library(scales)
library(grid)
library(RColorBrewer)
# Nine-step ColorBrewer grey ramp; individual shades are picked by index
# (e.g. palette[6]) for plot annotations.
palette <- brewer.pal(n = 9, name = "Greys")

Protagonist countries

# Component locations raw data (protagonist countries).
# Empty fields are read as NA and strings become factors.
protagonists <- read.table(
  '~/galean/scripts/queries/data/componentlocation.tsv',
  sep = '\t', header = TRUE, na.strings = '', stringsAsFactors = TRUE
)
prot.dt <- data.table(protagonists)
# write.table(prot.dt[, component_id, country_code], 'scripts/queries/data/country_component.txt', row.names=F)

# Keep only the columns used downstream, most frequently mentioned first.
prot <- prot.dt %>%
  select(component_id, country_code, frequency) %>%
  arrange(-frequency)

Interested countries

# Participation data: one row per component_id, one column per country.
participants <- read.table(
  '~/galean/scripts/queries/data/participation_data.txt',
  sep = '\t', header = TRUE, stringsAsFactors = FALSE, na.strings = ''
)

# Reshape to long form (component_id, country, tweets), upper-case the
# country codes, and sort by event with the largest tweet counts first.
part.tidy <- participants %>%
  gather(country, tweets, -component_id) %>%
  mutate(country = toupper(country)) %>%
  arrange(component_id, -tweets)

# The India column arrives as "IN." -- presumably because `in` is a reserved
# word in R and read.table() makes column names syntactic; restore the code.
part.tidy$country[part.tidy$country == "IN."] <- "IN"
part.tidy$country <- factor(part.tidy$country)
part.tidy <- data.table(part.tidy)

Countries List

# Lookup table of country codes and names; the file has no header row, so
# read.table() names the columns V1 and V2 before they are renamed.
countries_list <- data.table(
  read.table('~/galean/scripts/queries/data/countries.txt',
             sep = '\t', header = FALSE, stringsAsFactors = TRUE)
)
setnames(countries_list, c("V1", "V2"), c("country.code", "country.name"))

Summaries & filtering

Overall summary:

# Hard-coded dataset totals shown in the overall summary table.
events <- 25481
tweets <- 193447671
users <- 26127625

# One-row table; big.mark adds thousands separators when rendering.
summ <- data.frame(events = events, tweets = tweets, users = users)
kable(summ, row.names = FALSE, format.args = list(big.mark = ','))
events tweets users
25,481 193,447,671 26,127,625

Protagonist countries:

# Preview of the protagonist table (already sorted by decreasing frequency).
# component_id: identifier of the event
# frequency: no. of times the country was mentioned in the tweets of the event
head(prot)
##    component_id country_code frequency
## 1:        17658           US    584530
## 2:        24041           NP    547669
## 3:        17579           FR    349313
## 4:        17406           FR    345206
## 5:        24089           NP    341518
## 6:        24391           GB    245290
# Per-column summary: numeric quantiles and the most frequent country codes.
summary(prot)
##   component_id    country_code     frequency     
##  Min.   :    1   US     :10162   Min.   :    30  
##  1st Qu.: 7665   GB     : 4015   1st Qu.:   100  
##  Median :15180   IN     : 1561   Median :   285  
##  Mean   :14132   AU     :  974   Mean   :  1918  
##  3rd Qu.:20635   UA     :  921   3rd Qu.:   982  
##  Max.   :25481   RU     :  823   Max.   :584530  
##                  (Other):14161
# ggplot(prot, aes(x=1, y=frequency)) + geom_boxplot()

Interested countries:

# Preview of the long-form participation table (component_id, country, tweets).
head(part.tidy)
##    component_id country tweets
## 1:            1      US    319
## 2:            1      CO     30
## 3:            1      GB     18
## 4:            1      ID      7
## 5:            1      CA      6
## 6:            1      JP      5
# Per-column summary of the participation table.
summary(part.tidy)
##   component_id      country            tweets         
##  Min.   :    1   AD     :  20066   Min.   :     0.00  
##  1st Qu.: 6638   AE     :  20066   1st Qu.:     0.00  
##  Median :13566   AF     :  20066   Median :     0.00  
##  Mean   :13181   AG     :  20066   Mean   :     6.63  
##  3rd Qu.:19738   AI     :  20066   3rd Qu.:     0.00  
##  Max.   :25481   AL     :  20066   Max.   :108599.00  
##                  (Other):4896104
# ggplot(part.tidy, aes(x=1, y=tweets)) + geom_boxplot()

Remove countries with few protagonized events (< median) and remove events with few tweets (< median):

# prot.events_per_country <- prot[, .N, by=country_code]
# summary(prot.events_per_country$N)
# to_remove <- prot.events_per_country[N < 18.5]$country_code
# prot <- prot[!(country_code %in% to_remove)]

# summary(prot$frequency)
# to_remove <- prot[frequency < 288]$component_id
# prot <- prot[!(component_id %in% to_remove)]

Measure bias

Events being protagonized

# Count the rows of `prot` per country code.
prot.per_country <- prot[, .N, by=country_code]
n <- nrow(prot.per_country)
# Keep the 25 largest counts in ascending order (so the biggest bar ends up
# on top after coord_flip). The previous `(n-25):n` index selected 26 rows
# and failed when there were fewer than 26 countries.
prot.per_country.top25 <- tail(prot.per_country[order(N)], 25)

# Horizontal bar chart; the factor levels pin the bar order to the row order.
p <- ggplot(prot.per_country.top25, aes(x=factor(country_code, levels=country_code), y=N, label=comma(N))) +
  geom_bar(stat="identity") + coord_flip() + geom_text(size=2.5, hjust=-.1, color=palette[6]) +
  scale_y_continuous(labels=comma, breaks=seq(0, 12000, by=1000), limits = c(0, 12000)) +
  scale_x_discrete(labels=map_country) +
  fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") +
  xlab("Country") + ylab("Events") 

print(p + ggtitle("Top 25 countries protagonizing events"))

# The saved figures omit the title.
ggsave(paste0(PDF_PATH, "protagonist-bias.eps"), p, dpi=pl.DPI, width=pl.W, height=pl.H)
ggsave(paste0(PDF_PATH, "protagonist-bias.pdf"), p, dpi=pl.DPI, width=pl.W, height=pl.H)

Events a country is interested in

# Upper-tail percentiles of the per-(event, country) tweet counts.
quantile(part.tidy$tweets, probs = c(.8, .9, .95, .99, .999, .9999))
##      80%      90%      95%      99%    99.9%   99.99% 
##    0.000    2.000    8.000   77.000 1026.000 6053.901
# Use the .99 percentile (77 tweets) as the minimum for a country to count as
# interested in an event.
part.tidy.p99 <- part.tidy[tweets >= 77, ]
part.per_country <- part.tidy.p99[, .N, by=country]
n <- nrow(part.per_country)
# Keep the 25 largest counts in ascending order. The previous `(n - 25):n`
# index selected 26 rows and failed when there were fewer than 26 countries.
part.per_country.top25 <- tail(part.per_country[order(N)], 25)


# Horizontal bar chart; the factor levels pin the bar order to the row order.
p <- ggplot(part.per_country.top25, aes(x=factor(country, levels=country), y=N, label=comma(N))) +
  geom_bar(stat="identity") + coord_flip() +
  geom_text(size=2.5, hjust=-.1, color=palette[6]) +
  scale_y_continuous(labels=comma, breaks=seq(0, 12000, by=1000), limits = c(0, 12000)) +
  scale_x_discrete(labels=map_country) +
  fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") +
  xlab("Country") + ylab("Events")

print(p + ggtitle("Top 25 countries interested in events"))

# The saved figures omit the title.
ggsave(paste0(PDF_PATH, "interest-bias.eps"), p, dpi=pl.DPI, width=pl.W, height=pl.H)
ggsave(paste0(PDF_PATH, "interest-bias.pdf"), p, dpi=pl.DPI, width=pl.W, height=pl.H)

Tweets per interested country

# Total tweets per country over the rows that passed the 99th-percentile
# filter; the unnamed aggregate is stored in column V1.
part.sum_per_country <- part.tidy.p99[, sum(tweets), by=country]
n <- nrow(part.sum_per_country)
# Keep the 25 largest totals in ascending order. The previous `(n-25):n`
# index selected 26 rows and failed when there were fewer than 26 countries.
part.sum_per_country.top25 <- tail(part.sum_per_country[order(V1)], 25)

# Horizontal bar chart; the factor levels pin the bar order to the row order.
p <- ggplot(part.sum_per_country.top25, aes(x=factor(country, levels=country), y=V1, label=comma(V1))) +
  geom_bar(stat="identity") + coord_flip() +
  geom_text(size=2.5, hjust=-.1, color=palette[6]) +
  scale_y_continuous(labels=comma, breaks=seq(0, 14e6, by=2e6), limits = c(0, 14e6)) +
  scale_x_discrete(labels=map_country) +
  fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") +
  xlab("Country") + ylab("Tweets")

print(p + ggtitle("Top 25 countries with tweets"))

# The saved figures omit the title.
ggsave(paste0(PDF_PATH, "tweets-per-country.eps"), p, dpi=pl.DPI, width=pl.W, height=pl.H)
ggsave(paste0(PDF_PATH, "tweets-per-country.pdf"), p, dpi=pl.DPI, width=pl.W, height=pl.H)

Users per location

For most users, it was not possible to identify a location. The remaining users are concentrated mostly in US, GB, CA and ID (Indonesia).

# Users per location. The file is read without a header, so read.table()
# names the columns V1 (country code) and V2 (user count); empty codes
# become NA.
user_countries <- read.table('~/galean/scripts/queries/data/locations_distribution.txt', sep='\t', stringsAsFactors = FALSE, na.strings = "")
user_countries <- user_countries %>%
  transmute(country = toupper(V1),
            frequency = V2)
user_countries <- data.table(user_countries)
user_countries <- user_countries[order(frequency)]
n <- nrow(user_countries)
# Keep the 26 largest rows, as the previous `(n - 25):n` index did, but
# without failing when the table has fewer than 26 rows.
# NOTE(review): 26 presumably leaves 25 countries once the unknown-location
# row is dropped for the second plot -- confirm.
user_countries_pl <- tail(user_countries, 26)

# First plot: every retained row, including users with no identified country.
p <- ggplot(user_countries_pl, aes(x=factor(country, levels=country), y=frequency, label=comma(frequency))) + 
  geom_bar(stat="identity") + coord_flip() + xlab("Country") + ylab("Users") +
  geom_text(size=2.5, hjust=-.1, color=palette[6]) +
  scale_y_continuous(labels=comma, breaks=seq(0, 17.5e6, by=2e6), limits = c(0, 17.5e6)) +
  scale_x_discrete(labels=map_country) +
  fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") 
  
print(p + ggtitle("Users per country tweeting about events"))

ggsave(paste0(PDF_PATH, "users-per-country.eps"), p, dpi=pl.DPI, width=pl.W, height=pl.H)
ggsave(paste0(PDF_PATH, "users-per-country.pdf"), p, dpi=pl.DPI, width=pl.W, height=pl.H)

# Second plot: users with an identified country only. Missing codes are
# tested explicitly with is.na(); the literal "<NA>" comparison is kept so a
# code stored as that string is still excluded, as before.
p <- ggplot(user_countries_pl[!is.na(country) & country != "<NA>"], 
            aes(x=factor(country, levels=country), y=frequency, label=comma(frequency))) + 
  geom_bar(stat="identity") + coord_flip() + xlab("Country") + ylab("Users") +
  geom_text(size=2.5, hjust=-.1, color=palette[6]) +
  scale_y_continuous(labels=comma, breaks=seq(0, 4e6, by=.5e6), limits = c(0, 4e6)) +
  scale_x_discrete(labels=map_country) +
  fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") 
  
print(p + ggtitle("Users per country tweeting about events"))

ggsave(paste0(PDF_PATH, "users-per-country-f.eps"), p, dpi=pl.DPI, width=pl.W, height=pl.H)
ggsave(paste0(PDF_PATH, "users-per-country-f.pdf"), p, dpi=pl.DPI, width=pl.W, height=pl.H)

Normalization

(Co-)Protagonism (relative)

A country is represented as a vector with respect to all other countries \(C\):

\[c_i = \Big( \frac{\text{total events protagonized by } c_i \text{ and } c_j}{\text{total events protagonized by } c_i}, c_j \in C \Big) \]

# Distinct protagonist country codes.
countries <- unique(prot, by=c("country_code"))[, country_code]
# Self-join on the event id: one row per ordered pair of countries
# (country_code.x, country_code.y) that share an event, including each
# country paired with itself.
coprot <- prot %>% left_join(prot, by="component_id")

# do not consider self protagonism in events with more than 1 protagonist
# ...not sure what this was for; it is not used
# coprot <- coprot[coprot[, country_code.x != country_code.y | .N == 1, by=component_id]$V1]

# events protagonized per country
prot.by_country <- prot[, .N, by=country_code]

# For every country, tabulate how often each partner country shares one of
# its events (N) and express that as a fraction of the country's own event
# count (per).
coprot.vector <- list()
for (country in countries) {
  n_events <- prot.by_country[country_code == country, N]
  partner_counts <- coprot[country_code.x == country, .N, by=country_code.y]
  partner_counts[, per := N / n_events]
  coprot.vector[[country]] <- partner_counts
}

Co-protagonism of countries

Bar length represents the fraction of events protagonized by each country, with respect to total events protagonized by main country (in the title of each plot).

selected_countries <- c("US", "GB", "CA", "NG", "UA", "RU", "JP", "CN", "PS", "IL")


# One bar chart per selected country showing its strongest co-protagonists.
for(country in selected_countries) {
  # Drop the country itself and sort ascending so the strongest partner is
  # drawn on top after coord_flip().
  tmp <- coprot.vector[[country]][country_code.y != country][order(per)]
  # Keep the 16 strongest partners, as the previous `(n - 15):n` index did,
  # but without failing when a country has fewer than 16 partners.
  top_partners <- tail(tmp, 16)
  p <- ggplot(top_partners, aes(x=factor(country_code.y, levels=country_code.y), 
                                y=per, label=as.character(round(per, 4)))) + 
    geom_bar(stat="identity") + coord_flip() + xlab("Country") + ylab("Co-protagonism") +
    geom_text(size=2.5, hjust=-.1, color=palette[6]) +
    scale_y_continuous(breaks=seq(0, 1, by=.125), limits = c(0, .5)) +
    scale_x_discrete(labels=map_country) +
    fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") 
  
  print(p + ggtitle(map_country(country)))
  # The saved figures omit the title and are smaller than the other plots.
  ggsave(paste0(PDF_PATH, "co-prot-", country, ".eps"), p, 
         dpi=pl.DPI/1.5, width=pl.W-3, height=pl.H-1)
  ggsave(paste0(PDF_PATH, "co-prot-", country, ".pdf"), p, 
         dpi=pl.DPI/1.5, width=pl.W-3, height=pl.H-1) 
}

## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

# Rows of the event self-join per country (V1), i.e. the number of
# (event, partner) pairs each country takes part in, the country itself
# included. `.N` replaces the redundant mean(length(component_id)).
tmp <- data.table(prot %>% left_join(prot, "component_id"))[, list(V1 = as.numeric(.N)), by=country_code.x]
# Each country's share of all such rows, sorted ascending.
tmp[, per := V1 / sum(V1)]
tmp <- tmp[order(per)]

n <- nrow(tmp)
# Keep the 16 largest shares, as the previous `(n - 15):n` index did, but
# without failing when there are fewer than 16 countries.
top_shares <- tail(tmp, 16)
p <- ggplot(top_shares, aes(x=factor(country_code.x, levels=country_code.x), 
                            y=per, label=as.character(round(per, 4)))) + 
  geom_bar(stat="identity") + coord_flip() + xlab("Country") + ylab("Co-protagonism") +
  geom_text(size=2.5, hjust=-.1, color=palette[6]) +
  scale_y_continuous(breaks=seq(0, 1, by=.125), limits = c(0, .5)) +
  scale_x_discrete(labels=map_country) +
  fte_theme() +  geom_hline(yintercept=0, size=0.4, color="black") 

print(p + ggtitle("All"))

ggsave(paste0(PDF_PATH, "co-prot-all", ".eps"), p, 
       dpi=pl.DPI/1.5, width=pl.W-3, height=pl.H-1)
ggsave(paste0(PDF_PATH, "co-prot-all", ".pdf"), p, 
       dpi=pl.DPI/1.5, width=pl.W-3, height=pl.H-1) 

Comparison between countries

Remake plots from Sec 6.1:

# Dense co-protagonism vectors: for each country in `to_compare`, one value
# per entry of `countries` (0 where the two never share an event).
prot.comp <- list()
to_compare <- c("EC", "HN", "IL", "PS", "KP", "KR", "BD", "US", "GB", "UA", "RU", "MY", "VN", "CN", "JP", "CA", "NG")

country_names <- as.character(countries)

for(cnt in to_compare) {
  partners <- coprot.vector[[cnt]]
  # Look up every country in this vector's partner column in one pass,
  # instead of growing the result element by element in a nested loop.
  hit <- match(country_names, as.character(partners$country_code.y))
  # A missing country code never matched under the old `==` filter.
  hit[is.na(country_names)] <- NA
  found <- !is.na(hit)

  covector <- numeric(length(country_names))
  if (any(found)) {
    covector[found] <- partners$per[hit[found]]
  }
  names(covector) <- country_names
  prot.comp[[cnt]] <- covector
}


# Wide table: one column of co-protagonism values per compared country, plus
# the country each row refers to.
sel_codes <- c("EC", "HN", "IL", "PS", "KP", "KR", "BD", "US", "GB",
               "UA", "RU", "MY", "VN", "CN", "JP", "CA", "NG")
prot.comp_sel <- do.call(
  data.table,
  c(prot.comp[sel_codes], list(country = countries))
)

# Long form: x = partner country, y = compared country, value = co-protagonism.
prot.comp_sel2 <- data.table(
  prot.comp_sel %>%
    gather(y, value, -country) %>%
    select(x = country, y, value)
)

# Country pairs whose co-protagonism vectors are plotted against each other.
pairs <- list(c("EC", "HN"), c("IL", "PS"), c("KP", "KR"), c("BD", "US"), c("GB", "US"),
              c("RU", "UA"), c("EC", "US"), c("IL", "KP"), c("CN", "MY"), c("JP", "CN"),
              c("CA", "NG"))

for (pair in pairs) {
  pair_title <- paste(map_country(pair), collapse = " & ")
  print(pair_title)

  # One jittered point per (partner country, compared country); colour and
  # shape distinguish the two compared countries.
  tmp <- prot.comp_sel2[y %in% pair]
  p <- ggplot(tmp, aes(x = reorder(x, value),
                       y = value,
                       group = factor(y, levels = pair),
                       color = factor(y, levels = pair),
                       shape = factor(y, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(x, value), y=value, group=y), size=.5) +
    xlab("Countries") +
    ylab("Co-protagonism") +
    fte_theme() +
    # scale_x_discrete(labels=c())
    theme(legend.justification = c(0, 1), legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6),
          axis.text.x = element_blank()) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = map_country(pair)) +
    scale_shape_discrete(name = "Protagonist Country",
                         solid = F,
                         breaks = pair,
                         labels = map_country(pair))
    # scale_x_discrete(labels=map_country)

  print(p + ggtitle(pair_title))

  # Saved figures omit the title; file stem is e.g. "co-prot-EC-HN".
  file_stem <- paste(c("co-prot", pair), collapse = "-")
  for (extension in c(".eps", ".pdf")) {
    ggsave(paste(c(PDF_PATH, file_stem, extension), collapse = ""),
           p, dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Ecuador & Honduras"
## Warning: Removed 182 rows containing missing values (geom_point).

## Warning: Removed 182 rows containing missing values (geom_point).

## Warning: Removed 182 rows containing missing values (geom_point).

## [1] "Israel & Palestine"
## Warning: Removed 117 rows containing missing values (geom_point).
## Warning: Removed 107 rows containing missing values (geom_point).
## Warning: Removed 129 rows containing missing values (geom_point).

## [1] "North Korea & South Korea"
## Warning: Removed 124 rows containing missing values (geom_point).
## Warning: Removed 146 rows containing missing values (geom_point).
## Warning: Removed 130 rows containing missing values (geom_point).

## [1] "Bangladesh & United States"
## Warning: Removed 79 rows containing missing values (geom_point).
## Warning: Removed 81 rows containing missing values (geom_point).
## Warning: Removed 76 rows containing missing values (geom_point).

## [1] "Great Britain & United States"
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 23 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_point).

## [1] "Russia & Ukraine"
## Warning: Removed 85 rows containing missing values (geom_point).
## Warning: Removed 94 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).

## [1] "Ecuador & United States"
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 93 rows containing missing values (geom_point).
## Warning: Removed 87 rows containing missing values (geom_point).

## [1] "Israel & North Korea"
## Warning: Removed 113 rows containing missing values (geom_point).
## Warning: Removed 116 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).

## [1] "China & Malaysia"
## Warning: Removed 109 rows containing missing values (geom_point).
## Warning: Removed 113 rows containing missing values (geom_point).
## Warning: Removed 119 rows containing missing values (geom_point).

## [1] "Japan & China"
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 125 rows containing missing values (geom_point).
## Warning: Removed 96 rows containing missing values (geom_point).

## [1] "Canada & Nigeria"
## Warning: Removed 101 rows containing missing values (geom_point).
## Warning: Removed 107 rows containing missing values (geom_point).
## Warning: Removed 91 rows containing missing values (geom_point).

Export vectors to files:

# Write each country's co-protagonism table to its own tab-separated file.
# The loop variable is no longer called `c`, which shadowed base::c().
for (code in countries) {
  out_file <- paste0("~/galean/scripts/queries/data/protagonist_vectors/", code, ".txt")
  write.table(coprot.vector[[code]], out_file, row.names = FALSE, sep = "\t")
}

Similar countries

Euclidean distance is not suitable for similarity between countries (vectors), due to the high sparsity of the vectors. Cosine similarity is better.

# Pairwise country distances (cosine and Euclidean) read from headerless
# tab-separated files; columns become (c1, c2, distance).
knn.cos <- data.table(read.table('~/galean/scripts/queries/data/knn_cos.txt', header=FALSE, sep='\t', stringsAsFactors = TRUE, na.strings = ''))
knn.euc <- data.table(read.table('~/galean/scripts/queries/data/knn_euc.txt', header=FALSE, sep='\t', stringsAsFactors = TRUE, na.strings = ''))

setnames(knn.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))
setnames(knn.euc, c("V1", "V2", "V3"), c("c1", "c2", "dist.euc"))

# Each chart plots 1 - distance for one reference country against every other
# country, so taller bars mean more similar. The y-axis labels now say so;
# they previously read "distance" although the bars show 1 - distance.

# US (Euclidean)
knn.euc.us <- knn.euc[c1 == "US" & c2 != "US"][order(dist.euc)]
ggplot(knn.euc.us, aes(x=factor(c2, levels=c2), y=1-dist.euc)) + geom_bar(stat="identity") + coord_flip() +
  xlab("Country") + ylab("Similarity (1 - Euclidean distance)") + ggtitle("Most similar countries to US (euclidean distance) (higher is more similar)")

# US
knn.cos.us <- knn.cos[c1 == "US" & c2 != "US"][order(dist.cos)]
ggplot(knn.cos.us, aes(x=factor(c2, levels=c2), y=(1-dist.cos))) + geom_bar(stat="identity") + coord_flip() + 
  xlab("Country") + ylab("Similarity (1 - cosine distance)") + ggtitle("Most similar countries to US (cosine distance) (higher is more similar)")

# GB
knn.cos.gb <- knn.cos[c1 == "GB" & c2 != "GB"][order(dist.cos)]
ggplot(knn.cos.gb, aes(x=factor(c2, levels=c2), y=1-dist.cos)) + geom_bar(stat="identity") + coord_flip() +
  xlab("Country") + ylab("Similarity (1 - cosine distance)") + ggtitle("Most similar countries to GB (cosine distance) (higher is more similar)")

# UA
knn.cos.ua <- knn.cos[c1 == "UA" & c2 != "UA"][order(dist.cos)]
ggplot(knn.cos.ua, aes(x=factor(c2, levels=c2), y=1-dist.cos)) + geom_bar(stat="identity") + coord_flip() +
  xlab("Country") + ylab("Similarity (1 - cosine distance)") + ggtitle("Most similar countries to Ukraine (cosine distance) (higher is more similar)")

# VE
knn.cos.ve <- knn.cos[c1 == "VE" & c2 != "VE"][order(dist.cos)]
ggplot(knn.cos.ve, aes(x=factor(c2, levels=c2), y=1-dist.cos)) + geom_bar(stat="identity") + coord_flip() +
  xlab("Country") + ylab("Similarity (1 - cosine distance)") + ggtitle("Most similar countries to Venezuela (cosine distance) (higher is more similar)")

# CL
knn.cos.cl <- knn.cos[c1 == "CL" & c2 != "CL"][order(dist.cos)]
ggplot(knn.cos.cl, aes(x=factor(c2, levels=c2), y=1-dist.cos)) + geom_bar(stat="identity") + coord_flip() +
  xlab("Country") + ylab("Similarity (1 - cosine distance)") + ggtitle("Most similar countries to Chile (cosine distance) (higher is more similar)")

Get all countries with their most similar counterparts:

# Nearest neighbour of every country under cosine distance. Pairs at distance
# exactly 0 are dropped first; ties for the minimum are all kept.
nonzero_pairs <- knn.cos[dist.cos != 0]
nearest_rows <- nonzero_pairs[, .I[dist.cos == min(dist.cos)], by=c1]$V1
one_nn <- na.omit(nonzero_pairs[nearest_rows][order(dist.cos)])

# Same pairs with country names instead of codes, closest pairs first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(dist.cos)]

kable(one_nn.table[1:30]) #, format="latex", booktabs=T)
V1 V2 dist.cos
Russia Ukraine 0.0079
Ukraine Russia 0.0079
Canada Nigeria 0.0211
Nigeria Canada 0.0211
China Japan 0.0239
Japan China 0.0239
France Italy 0.0259
Italy France 0.0259
Peru Canada 0.0266
Iraq Syria 0.0301
Syria Iraq 0.0301
Afghanistan Cuba 0.0314
Cuba Afghanistan 0.0314
Spain France 0.0320
Mexico Netherlands 0.0338
Netherlands Mexico 0.0338
Kenya Nigeria 0.0355
Puerto Rico Seychelles 0.0362
Seychelles Puerto Rico 0.0362
Iran Iraq 0.0364
Libya Iraq 0.0365
Venezuela Cuba 0.0381
Germany France 0.0384
Australia India 0.0386
India Australia 0.0386
United Arab Emirates South Korea 0.0396
South Korea United Arab Emirates 0.0396
Sudan Libya 0.0404
Antarctica Canada 0.0407
Nepal South Africa 0.0429
# Country pairs closer than 0.05 in cosine distance (zero distances excluded),
# drawn as a grid of labelled tiles.
knn.cos.pl <- knn.cos[order(dist.cos)]
knn.cos.pl <- knn.cos.pl[dist.cos < .05 & dist.cos > 0]
# `fill` is set on the geom rather than mapped inside aes(): inside aes() the
# string "white" is treated as a data value, which gives the tiles the default
# discrete colour and adds a spurious legend.
ggplot(knn.cos.pl, aes(x=reorder(c1, dist.cos), y=reorder(c2, dist.cos),
                       label=as.character(round(dist.cos, 2)))) + 
  geom_tile(size=.1, color="white", fill="white") + fte_theme() +
  geom_text(size=2, color=palette[9]) +
  # scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x=element_text(size=7, angle=45, hjust=1),
        axis.text.y=element_text(size=7)) + 
  labs(x="Country", y="Country")

(Co-)Protagonism (symmetric)

A country is represented as the vector of Jaccard similarities between it and the rest:

\[ c_i = \Big( \frac{\text{events protagonized by } c_i \text{ and } c_j}{\text{events protagonized by } c_i \text{ or } c_j}, c_j \in C \Big)\]

The co-protagonism between two countries is defined as follows:

\[ s(c1, c2) = \frac{|E(c1) \cap E(c2)|}{|E(c1) \cup E(c2)|} \]

where \(E(c1)\) is the set of events protagonized by \(c1\).

# Distribution of the number of protagonized events per country.
quantile(prot[, .N, by = country_code]$N, probs = c(.5, .75, .9, .95, .99))
##     50%     75%     90%     95%     99% 
##   18.50   80.25  315.50  631.75 1683.70
# Events per protagonist country, and a summary of those counts.
prot.events_per_country <- prot[, list(N = .N), by = country_code]
summary(prot.events_per_country[["N"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     4.00    18.50   166.40    80.25 10160.00
# Drop countries whose event count is below the mean count per country.
mean_events <- mean(prot.events_per_country$N)
to_remove <- prot.events_per_country[N < mean_events]$country_code
prot.f <- prot[!(country_code %in% to_remove)]

# Jaccard similarity between the event sets of every ordered pair of the
# countries that survived the filter above.
countries <- unique(prot.f, by=c("country_code"))[, country_code]
country_names <- as.character(countries)

# Event ids per country, computed once instead of re-filtering `prot` inside
# the inner loop.
events_by_country <- split(prot$component_id, as.character(prot$country_code))

# All ordered pairs, first country varying slowest (same order as the nested
# loops this replaces); the vectors are built at full length rather than
# grown with c() on every iteration.
n_countries <- length(country_names)
cis <- rep(country_names, each = n_countries)
cjs <- rep(country_names, times = n_countries)
jacs <- vapply(seq_along(cis), function(k) {
  events_ci <- events_by_country[[cis[k]]]
  events_cj <- events_by_country[[cjs[k]]]
  length(intersect(events_ci, events_cj)) / length(union(events_ci, events_cj))
}, numeric(1))

# Long table of pairwise Jaccard similarities (one row per ordered pair).
prot.sim <- data.table(country.x = cis, country.y = cjs, jacc.sim = jacs)
# Keep pairs with a partial overlap only (drops similarity 0 and 1), most
# similar first.
prot.most_similar <- prot.sim[jacc.sim > 0 & jacc.sim < 1][order(-jacc.sim)]
# Attach the event count of each side of the pair. setnames() renames in
# place, so each side is temporarily called "country_code" to match the join
# key of prot.by_country; the statement order below matters.
setnames(prot.most_similar, "country.x", "country_code")
# First join: adds column N for the first country of the pair.
tmp <- prot.most_similar %>% left_join(prot.by_country, "country_code")
# Restore country.x and expose country.y as the join key.
setnames(tmp, c("country_code", "country.y"), c("country.x", "country_code"))
# Second join: the two count columns are now N.x (first country) and N.y
# (second country), as used in the table below.
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("country.y"))
prot.most_similar <- tmp
prot.most_similar <- prot.most_similar[order(-jacc.sim)]

# Top 50 pairs with country names instead of codes.
prot.most_similar.table <- prot.most_similar[1:50][, list(map_country(country.x),
                                                    map_country(country.y),
                                                    N.x, N.y,
                                                    jacc.sim)]
kable(prot.most_similar.table[1:50])#, format="latex", booktabs=T)
V1 V2 N.x N.y jacc.sim
Palestine Israel 360 561 0.2863128
Israel Palestine 561 360 0.2863128
Ukraine Russia 921 823 0.2094313
Russia Ukraine 823 921 0.2094313
United States Great Britain 10162 4015 0.0966120
Great Britain United States 4015 10162 0.0966120
Syria Iraq 647 654 0.0832639
Iraq Syria 654 647 0.0832639
Pakistan India 453 1561 0.0752803
India Pakistan 1561 453 0.0752803
Iran Israel 496 561 0.0698381
Israel Iran 561 496 0.0698381
Japan China 354 646 0.0604454
China Japan 646 354 0.0604454
France Germany 627 371 0.0583245
Germany France 371 627 0.0583245
Great Britain Australia 4015 974 0.0576638
Australia Great Britain 974 4015 0.0576638
Germany Brazil 371 236 0.0574913
Brazil Germany 236 371 0.0574913
Turkey Syria 198 647 0.0536160
Syria Turkey 647 198 0.0536160
Iran Iraq 496 654 0.0511883
Iraq Iran 654 496 0.0511883
Malaysia Australia 262 974 0.0492360
Australia Malaysia 974 262 0.0492360
India Australia 1561 974 0.0475207
Australia India 974 1561 0.0475207
Great Britain Canada 4015 715 0.0443807
Canada Great Britain 715 4015 0.0443807
Libya Egypt 253 316 0.0440367
Egypt Libya 316 253 0.0440367
India Great Britain 1561 4015 0.0436085
Great Britain India 4015 1561 0.0436085
Palestine Egypt 360 316 0.0432099
Egypt Palestine 316 360 0.0432099
Syria Iran 647 496 0.0428832
Iran Syria 496 647 0.0428832
Spain Germany 258 371 0.0413907
Germany Spain 371 258 0.0413907
United States Canada 10162 715 0.0370900
Canada United States 715 10162 0.0370900
Great Britain France 4015 627 0.0363921
France Great Britain 627 4015 0.0363921
Yemen Iran 202 496 0.0356083
Iran Yemen 496 202 0.0356083
India China 1561 646 0.0346929
China India 646 1561 0.0346929
United States India 10162 1561 0.0345041
India United States 1561 10162 0.0345041
# Heatmap of Jaccard similarity for every retained pair; axes are ordered by
# decreasing similarity and the fill uses a log-transformed gradient.
ggplot(prot.most_similar,
       aes(x = reorder(country.x, -jacc.sim),
           y = reorder(country.y, -jacc.sim),
           fill = jacc.sim,
           label = as.character(round(jacc.sim, 2)))) +
  geom_tile(size = .1) +
  geom_text(size = 2, color = palette[1]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  scale_fill_gradient(trans = "log") +
  labs(x = "Country", y = "Country") +
  fte_theme() +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7))

selected_countries <- c("IL", "PS", "UA", "RU", "GB", "US", "IQ", "SY")

# For each selected country, print its name followed by a table of the ten
# countries with the highest Jaccard similarity (similarity 1 excluded).
for (country in selected_countries) {
  neighbours <- prot.sim[country.x == country & jacc.sim != 1][order(-jacc.sim)][1:10]
  neighbours <- neighbours[, list(Country = map_country(country.y),
                                  Similarity = jacc.sim)]
  print(map_country(country))
  print(kable(neighbours)) #, format="latex", booktabs=T)
}
##       IL 
## "Israel" 
## 
## 
## Country          Similarity
## --------------  -----------
## Palestine         0.2863128
## Iran              0.0698381
## Egypt             0.0317647
## Syria             0.0307167
## France            0.0259067
## United States     0.0208492
## Iraq              0.0201511
## Pakistan          0.0201207
## Great Britain     0.0184732
## Ukraine           0.0178571
##          PS 
## "Palestine" 
## 
## 
## Country          Similarity
## --------------  -----------
## Israel            0.2863128
## Egypt             0.0432099
## Nigeria           0.0157895
## Pakistan          0.0149813
## Iraq              0.0129870
## Great Britain     0.0115607
## Ukraine           0.0110497
## Syria             0.0110442
## China             0.0100402
## United States     0.0090142
##        UA 
## "Ukraine" 
## 
## 
## Country          Similarity
## --------------  -----------
## Russia            0.2094313
## United States     0.0249699
## France            0.0238095
## Germany           0.0213439
## Syria             0.0188434
## Great Britain     0.0183619
## Israel            0.0178571
## Malaysia          0.0154506
## Iraq              0.0148196
## Nigeria           0.0144597
##       RU 
## "Russia" 
## 
## 
## Country          Similarity
## --------------  -----------
## Ukraine           0.2094313
## France            0.0305615
## Canada            0.0301407
## Syria             0.0286914
## United States     0.0285581
## Germany           0.0275387
## China             0.0265549
## Great Britain     0.0241321
## Iran              0.0240683
## Turkey            0.0179462
##              GB 
## "Great Britain" 
## 
## 
## Country          Similarity
## --------------  -----------
## United States     0.0966120
## Australia         0.0576638
## Canada            0.0443807
## India             0.0436085
## France            0.0363921
## Syria             0.0311878
## Nigeria           0.0257183
## Russia            0.0241321
## Pakistan          0.0238313
## China             0.0230465
##              US 
## "United States" 
## 
## 
## Country          Similarity
## --------------  -----------
## Great Britain     0.0966120
## Canada            0.0370900
## India             0.0345041
## Australia         0.0319711
## Iraq              0.0288215
## Russia            0.0285581
## Iran              0.0271781
## China             0.0256216
## Syria             0.0252300
## Ukraine           0.0249699
##     IQ 
## "Iraq" 
## 
## 
## Country          Similarity
## --------------  -----------
## Syria             0.0832639
## Iran              0.0511883
## United States     0.0288215
## Libya             0.0283447
## Turkey            0.0228091
## Great Britain     0.0205464
## Israel            0.0201511
## Afghanistan       0.0198330
## Pakistan          0.0193370
## Nigeria           0.0152381
##      SY 
## "Syria" 
## 
## 
## Country          Similarity
## --------------  -----------
## Iraq              0.0832639
## Turkey            0.0536160
## Iran              0.0428832
## Great Britain     0.0311878
## Israel            0.0307167
## Russia            0.0286914
## United States     0.0252300
## Libya             0.0250569
## France            0.0208333
## Ukraine           0.0188434

Interest (relative to max)

The interest of a country in an event is the number of tweets it published about that event, expressed as a fraction of the maximum number of tweets it published about any single event.

# Keep (event, country) rows with at least one tweet.
part.filtered <- part.tidy[tweets > 0]
setkey(part.filtered, component_id)

# Per country, scale each event's tweet count by that country's largest
# single-event count, so interest is 1 for its most-tweeted event.
part.frac <- part.filtered[, list(component_id,
                                  tweets,
                                  interest = as.numeric(tweets / max(tweets))),
                           by = country]

Most interest is concentrated below \(0.25\).

# Histogram of interest values in bins of width 0.01.
p <- ggplot(part.frac, aes(x = interest)) +
  geom_histogram(binwidth = 0.01) +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  scale_y_continuous(labels = comma) +
  xlab("Interest") +
  ylab("Frequency") +
  fte_theme()

print(p)

for (out_name in c("interest-hist.eps", "interest-hist.pdf")) {
  ggsave(paste(c(PDF_PATH, out_name), collapse = ""), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}

These countries have more than one event with interest equal to 1.0, possibly meaning that they have few tweets:

# Countries with more than one row at interest == 1: which(.N > 1) yields 1
# for such groups and an empty vector (group dropped) for all others.
(part.few <- part.frac[interest == 1][, which(.N > 1), by = country])
##     country V1
##  1:      DZ  1
##  2:      DM  1
##  3:      CV  1
##  4:      LC  1
##  5:      PW  1
##  6:      TJ  1
##  7:      SH  1
##  8:      SM  1
##  9:      AI  1
## 10:      BV  1
## 11:      CC  1
## 12:      TK  1
## 13:      KY  1
## 14:      HM  1
## 15:      CX  1
# All non-zero rows belonging to the countries flagged in part.few.
part.few2 <- part.filtered[country %in% part.few[["country"]]]
print(part.few2)
##       component_id country tweets
##    1:            4      DZ      1
##    2:            7      DZ      1
##    3:           11      DM      1
##    4:           12      CV      1
##    5:           17      DM      1
##   ---                            
## 7751:        25473      DZ      1
## 7752:        25474      DZ      8
## 7753:        25476      DZ      3
## 7754:        25477      TJ      1
## 7755:        25478      DZ      1
# Upper quantiles of the tweet counts of those countries.
quantile(part.few2[["tweets"]], probs = c(0.5, 0.75, 0.9, 0.99, 0.999))
##    50%    75%    90%    99%  99.9% 
##  1.000  3.000  6.000 26.000 61.246

Re-filter the table to drop entries with very few tweets (more than 6 tweets are required, 6 being the 90th percentile computed above):

# Same interest computation as before, restricted to rows with more than
# 6 tweets.
part.filtered <- part.tidy[tweets > 6]
setkey(part.filtered, component_id)
part.frac <- part.filtered[,
  list(component_id, tweets, interest = as.numeric(tweets / max(tweets))),
  by = country
]
# Rebuild the factor so it only carries the countries still present.
part.frac[, country := factor(country)]

# Histogram of interest after the low-tweet filter (bin width 0.01).
p <- ggplot(part.frac, aes(x = interest)) +
  geom_histogram(binwidth = 0.01) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  labs(x = "Interest", y = "Frequency") +
  scale_y_continuous(labels = comma)

print(p)

# Save the same plot as EPS, then as PDF.
for (fname in c("interest-hist-f.eps", "interest-hist-f.pdf")) {
  ggsave(paste0(PDF_PATH, fname), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Distribution of interest

# Box plot of interest for a hand-picked set of countries (ISO codes).
selected_countries <- c("US", "GB", "UA", "RU", "BR",
                        "CL", "SY", "IQ", "IL", "PS")
selected.rows <- part.frac[country %in% selected_countries]

p <- ggplot(selected.rows, aes(x = country, y = interest)) +
  geom_boxplot() +
  fte_theme() +
  scale_x_discrete(labels = map_country) +
  labs(x = "", y = "Interest")

# The title is only added to the printed version, not to the saved files.
print(p + ggtitle("Distribution of interest of selected countries"))

for (fname in c("interest-dist.eps", "interest-dist.pdf")) {
  ggsave(paste0(PDF_PATH, fname), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Comparison between countries

Re-make the plots of Fig. 6.

Each plot is the average interest of each country (x-axis) to the events protagonized by the two selected countries.

# Join interest rows with the protagonist country of each event, then average
# the interest per (interested country, protagonist country) pair.
part.wrt.all <- part.frac %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

# Drop the rows left incomplete by the full join and return to a data.table.
part.wrt.all <- data.table(na.omit(part.wrt.all))

# Pairs of protagonist countries (ISO codes) to compare.
selected_pairs <- list(
  c("RU", "UA"), c("IL", "PS"), c("BR", "DE"), c("GB", "US"), c("BR", "US"),
  c("DE", "IL"), c("BR", "PS"), c("MY", "VN"), c("CN", "MY"), c("BO", "GA"),
  c("PY", "TJ"), c("FO", "SJ"), c("MY", "UA"), c("AR", "BR"), c("CS", "TM")
)

# One scatter plot per pair: interested countries on the x-axis (arranged with
# reorder() on avg.interest), average interest on the y-axis, colour and shape
# distinguishing the two protagonists. The titled plot is printed; the
# untitled plot is saved as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)

  print(p + ggtitle(pair.title))

  # File name: <PDF_PATH>int-prot-<code1>-<code2>.<ext>
  file.stem <- paste(c("int-prot", pair), collapse = "-")
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(PDF_PATH, file.stem, ext), p,
           dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Russia & Ukraine"

## [1] "Israel & Palestine"

## [1] "Brazil & Germany"

## [1] "Great Britain & United States"

## [1] "Brazil & United States"

## [1] "Germany & Israel"

## [1] "Brazil & Palestine"

## [1] "Malaysia & Vietnam"

## [1] "China & Malaysia"

## [1] "Bolivia & Gabon"

## [1] "Paraguay & Tajikistan"

## [1] "Faroe Islands & Svalbard and Jan Mayen"

## [1] "Malaysia & Ukraine"

## [1] "Argentina & Brazil"

## [1] "Serbia & Turkmenistan"

Export vectors to files:

# Write one tab-separated file per protagonist country containing its rows of
# part.wrt.all (the average interest every other country shows in it).
#
# The country list is derived from the table being exported, as the other
# export loops in this file do, instead of relying on whatever `countries`
# happened to hold from an earlier section.
countries <- unique(part.wrt.all$c.protagonist)
for (code in as.character(countries)) {
  x <- part.wrt.all[c.protagonist == code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", code, ".txt")

  write.table(x, f, row.names = FALSE, sep = "\t")
}

Similar countries

# Load the precomputed pairwise cosine values (country 1, country 2, value).
knn.int.cos <- data.table(
  read.table("~/galean/scripts/queries/data/knn_int_cos.txt",
             header = FALSE, sep = "\t",
             stringsAsFactors = TRUE, na.strings = "")
)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# For every c1 keep the row(s) with the largest value, ignoring values equal
# to 1; ties are all kept.
one_nn <- knn.int.cos[dist.cos != 1]
best.rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- na.omit(one_nn[best.rows][order(-dist.cos)])

# Same rows with country codes mapped to names, highest value first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(-dist.cos)]

kable(one_nn.table[1:30])
V1 V2 dist.cos
Solomon Islands Hungary 0.9999
Solomon Islands Dominica 0.9999
Solomon Islands Bulgaria 0.9999
Solomon Islands Faroe Islands 0.9999
Solomon Islands Puerto Rico 0.9999
Solomon Islands Estonia 0.9999
Solomon Islands Republic of the Congo 0.9999
Solomon Islands Svalbard and Jan Mayen 0.9999
Solomon Islands Ivory Coast 0.9999
Solomon Islands Montenegro 0.9999
Gabon NA 0.9977
Bolivia NA 0.9976
Bhutan Montenegro 0.9975
Bhutan Maldives 0.9975
Fiji Syria 0.9945
Gambia Jordan 0.9917
Togo Dominica 0.9892
Poland Afghanistan 0.9869
Vanuatu Haiti 0.9825
Lesotho Slovakia 0.9799
Cook Islands Belize 0.9773
Mongolia Belize 0.9773
Greenland NA 0.9769
Cocos [Keeling] Islands NA 0.9758
Ecuador Lithuania 0.9752
East Timor Haiti 0.9750
Burundi Belgium 0.9745
Somalia Finland 0.9704
Macedonia New Zealand 0.9675
Sierra Leone Nigeria 0.9673
# Tile plot of the country pairs whose cosine value lies strictly between 0
# and 0.5, each tile labelled with the value rounded to two decimals.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]

# The white fill is a constant, so it is set on the geom. Inside aes() the
# string "white" would be treated as a data value and coloured by the default
# discrete fill scale instead of being drawn white.
ggplot(knn.int.cos.pl, aes(x = reorder(c1, dist.cos),
                           y = reorder(c2, dist.cos),
                           label = as.character(round(dist.cos, 2)))) +
  geom_tile(fill = "white", size = .1, color = "white") + fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) + scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")

Events with most interest

# Quantiles of the max-relative interest values.
quantile(part.frac[["interest"]], probs = c(0.5, 0.75, 0.8, 0.9, 0.99))
##         50%         75%         80%         90%         99% 
## 0.008673027 0.026315789 0.034852547 0.070422535 0.319908521
# Rows whose interest exceeds the overall median interest.
part.median_interest <- part.frac[interest > median(interest)]

# Number of such rows per event, kept in a column named V1.
part.countries_by_event <- part.median_interest[, list(V1 = .N), by = component_id]

Events with maximum number of countries interested (and with interest > median interest):

# Ten events with the highest counts, largest first.
top.events <- part.countries_by_event[order(-V1)][1:10]
kable(top.events, col.names = c("component_id", "no of countries"))
component_id no of countries
15945 180
14355 160
18989 160
13187 155
2878 154
24460 154
12321 153
298 151
11432 150
14577 150

Even with no filtering (interest > 0), the resulting list is unchanged. That means, the events with most countries interested produced high interest from every country (at least in top 50%).

Interest (relative to sum)

Measure the interest of a country in an event as the fraction of all tweets issued from that country that belong to the event.

# Keep only the (event, country) rows with at least one tweet, keyed by event.
part.filtered <- part.tidy[tweets > 0]
setkey(part.filtered, component_id)

# Per country: tweets in each event divided by the country's total tweets,
# stored directly as the `interest` column.
part.frac.all <- part.filtered[,
  list(component_id, tweets, interest = as.numeric(tweets / sum(tweets))),
  by = country
]

Distribution of interest

Countries with largest IQR of interest:

# Inter-quartile range of interest per country; show the 30 largest.
iqr.by.country <- part.frac.all[, IQR(interest), by = country]
kable(iqr.by.country[order(-V1)][1:30])
country V1
MC 0.0033784
NU 0.0033333
GS 0.0021097
VA 0.0020270
MO 0.0018904
IO 0.0017123
SZ 0.0014286
SR 0.0013280
SL 0.0010718
KG 0.0009416
AG 0.0007955
BN 0.0007794
CR 0.0007452
CG 0.0006998
ER 0.0006840
GW 0.0006566
MP 0.0006532
LS 0.0006184
SV 0.0005797
GU 0.0005376
UZ 0.0005280
VI 0.0005198
GA 0.0004647
LR 0.0004505
LI 0.0004446
MN 0.0004286
FJ 0.0004186
LU 0.0003748
MU 0.0003733
MQ 0.0003546
# Box plot of sum-relative interest for a hand-picked set of countries.
selected_countries <- c("US", "GB", "UA", "RU", "BR",
                        "CL", "SY", "IQ", "IL", "PS")
selected.rows <- part.frac.all[country %in% selected_countries]

p <- ggplot(selected.rows, aes(x = country, y = interest)) +
  geom_boxplot() +
  fte_theme() +
  scale_x_discrete(labels = map_country) +
  labs(x = "", y = "Interest")

# The title is only added to the printed version, not to the saved files.
print(p + ggtitle("Distribution of interest of selected countries"))

for (fname in c("interest-all-dist.eps", "interest-all-dist.pdf")) {
  ggsave(paste0(PDF_PATH, fname), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Comparison between countries

# Join sum-relative interest with the protagonist country of each event, then
# average the interest per (interested country, protagonist country) pair.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

# Drop the rows left incomplete by the full join and return to a data.table.
part.wrt.all <- data.table(na.omit(part.wrt.all))

# Pairs of protagonist countries (ISO codes) to compare.
selected_pairs <- list(
  c("RU", "UA"), c("IL", "PS"), c("BR", "DE"), c("GB", "US"), c("BR", "US"),
  c("DE", "IL"), c("BR", "PS"), c("MY", "VN"), c("CN", "MY"), c("BO", "GA"),
  c("PY", "TJ"), c("FO", "SJ"), c("MY", "UA"), c("AR", "BR"), c("CS", "TM")
)

# One scatter plot per pair, built from the sum-relative averages. The titled
# plot is printed; the untitled plot is saved as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)

  print(p + ggtitle(pair.title))

  # File name: <PDF_PATH>int-prot-all-<code1>-<code2>.<ext>
  file.stem <- paste(c("int-prot-all", pair), collapse = "-")
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(PDF_PATH, file.stem, ext), p,
           dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Russia & Ukraine"

## [1] "Israel & Palestine"

## [1] "Brazil & Germany"

## [1] "Great Britain & United States"

## [1] "Brazil & United States"

## [1] "Germany & Israel"

## [1] "Brazil & Palestine"

## [1] "Malaysia & Vietnam"

## [1] "China & Malaysia"

## [1] "Bolivia & Gabon"

## [1] "Paraguay & Tajikistan"

## [1] "Faroe Islands & Svalbard and Jan Mayen"

## [1] "Malaysia & Ukraine"

## [1] "Argentina & Brazil"

## [1] "Serbia & Turkmenistan"

Export vectors to files:

# Write one tab-separated file per protagonist country with its rows of
# part.wrt.all.
countries <- unique(part.wrt.all$c.protagonist)
for (code in countries) {
  x <- part.wrt.all[c.protagonist == code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", code, ".txt")

  write.table(x, f, row.names = FALSE, sep = "\t")
}

 Similar countries

# Load the precomputed pairwise cosine values (country 1, country 2, value).
knn.int.cos <- data.table(
  read.table("~/galean/scripts/queries/data/knn_int_cos2.txt",
             header = FALSE, sep = "\t",
             stringsAsFactors = TRUE, na.strings = "")
)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# For every c1 keep the row(s) with the largest value, ignoring values equal
# to 1; ties are all kept.
one_nn <- knn.int.cos[dist.cos != 1]
best.rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- na.omit(one_nn[best.rows][order(-dist.cos)])

# Same rows with country codes mapped to names, highest value first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(-dist.cos)]

kable(one_nn.table[1:30])
V1 V2 dist.cos
Cuba United States 0.9995
United States Cuba 0.9995
Costa Rica Uruguay 0.9994
Uruguay Costa Rica 0.9994
Argentina Brazil 0.9993
Brazil Argentina 0.9993
Bosnia and Herzegovina Honduras 0.9992
Honduras Bosnia and Herzegovina 0.9992
Honduras Croatia 0.9992
Croatia Costa Rica 0.9992
Croatia Honduras 0.9992
Denmark Russia 0.9991
Spain Argentina 0.9991
Iraq North Korea 0.9991
North Korea Iraq 0.9991
Russia Denmark 0.9991
Libya Cuba 0.9990
Mexico Peru 0.9990
Peru Mexico 0.9990
Taiwan Denmark 0.9989
Vietnam Denmark 0.9989
Botswana United States 0.9988
Israel Iraq 0.9988
Aruba Libya 0.9987
France Italy 0.9987
Guyana Papua 0.9987
Italy France 0.9987
Papua Guyana 0.9987
Democratic Republic of the Congo Colombia 0.9986
Colombia Democratic Republic of the Congo 0.9986
# Tile plot of the first 30 nearest-neighbour rows, filled by the cosine value
# and labelled with that value rounded to two decimals.
nn.heatmap <- ggplot(
  one_nn[1:30],
  aes(
    x = reorder(c1, -dist.cos),
    y = reorder(c2, -dist.cos),
    fill = dist.cos,
    label = as.character(round(dist.cos, 2))
  )
) +
  geom_tile(size = .1) +
  fte_theme() +
  geom_text(size = 2, color = palette[1]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 7)
  ) +
  labs(x = "Country", y = "Country")
print(nn.heatmap)

# Rebuild the average sum-relative interest per (interested, protagonist)
# country pair from part.frac.all and prot.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

# Drop the rows left incomplete by the full join and return to a data.table.
part.wrt.all <- data.table(na.omit(part.wrt.all))

# Second set of protagonist pairs (ISO codes) plotted from the sum-relative
# averages.
selected_pairs <- list(
  c("FO", "SJ"), c("MY", "UA"), c("FJ", "VU"), c("RS", "TM"), c("CN", "DE"),
  c("PY", "TJ"), c("NL", "MY"), c("BO", "TL"), c("GA", "TL"), c("AU", "DE"),
  c("PL", "TR"), c("IN", "AU"), c("AT", "IQ"), c("KE", "YE"), c("AM", "VA")
)

# One scatter plot per pair. The titled plot is printed; the untitled plot is
# saved as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)

  print(p + ggtitle(pair.title))

  # File name: <PDF_PATH>int-prot-all-<code1>-<code2>.<ext>
  file.stem <- paste(c("int-prot-all", pair), collapse = "-")
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(PDF_PATH, file.stem, ext), p,
           dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Faroe Islands & Svalbard and Jan Mayen"

## [1] "Malaysia & Ukraine"

## [1] "Fiji & Vanuatu"

## [1] "Serbia & Turkmenistan"

## [1] "China & Germany"

## [1] "Paraguay & Tajikistan"

## [1] "Netherlands & Malaysia"

## [1] "Bolivia & East Timor"

## [1] "Gabon & East Timor"

## [1] "Australia & Germany"

## [1] "Poland & Turkey"

## [1] "India & Australia"

## [1] "Austria & Iraq"

## [1] "Kenya & Yemen"

## [1] "Armenia & Vatican City"

Events with most interest

# Five-number summary (plus mean) of the sum-relative interest values.
summary(part.frac.all[["interest"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000001 0.0000181 0.0000542 0.0002377 0.0001670 0.5000000
# Quantiles of the sum-relative interest values.
quantile(part.frac.all[["interest"]], probs = c(0.5, 0.75, 0.8, 0.9, 0.99))
##          50%          75%          80%          90%          99% 
## 5.424464e-05 1.669728e-04 2.135535e-04 4.446421e-04 2.701790e-03
# Despite the variable name, the cut-off used here is the hard-coded 99th
# percentile printed by the quantile() call above, not the median.
part.median_interest <- part.frac.all[interest > 2.701790e-03]

# Number of rows above the cut-off per event, kept in a column named V1.
part.countries_by_event <- part.median_interest[, list(V1 = .N), by = component_id]

Events with maximum number of countries interested (and with interest above the 99th percentile, 2.70e-03):

# Ten events with the highest counts, largest first.
top.events <- part.countries_by_event[order(-V1)][1:10]
kable(top.events, col.names = c("component_id", "no of countries"))
component_id no of countries
15945 202
298 153
14355 144
12321 143
18989 130
2878 127
13187 106
17406 102
19427 97
24460 97
# Hand-written description table for the high-interest events, rendered as a
# LaTeX (booktabs) table.
# NOTE(review): the computed top-10 above contains a 153-country row
# (component 298) that has no entry here - confirm the omission is intended.
# NOTE(review): `events` reuses the name of the overall event count defined at
# the top of the file.
hi_events <- data.table(
  Description = c(
    "Death of actor Robin Williams.",
    "2014 FIFA World Cup final between Germany and Argentina.",
    "2014 FIFA World Cup starts.",
    "2015 Super Bowl starts.",
    "New Year's Eve 2013",
    "Soccer Player Luis Suarez is suspended from 2014 World Cup.",
    "Charlie Hebdo shooting in Paris.",
    "2015 Grammy Awards.",
    "Professional boxing match between Floyd Mayweather and Manny Pacquiao."
  ),
  Date = c(
    "2014-08-12", "2014-07-13", "2014-06-12",
    "2015-02-02", "2013-12-31", "2014-06-26",
    "2015-01-07", "2015-02-09", "2015-05-03"
  ),
  Countries = c(202, 144, 143, 130, 127, 106, 102, 97, 97)
)

# Keep the stand-alone vectors available under their original names.
events <- hi_events$Description
dates <- hi_events$Date
countries_affected <- hi_events$Countries

kable(hi_events, format = "latex", booktabs = TRUE)
\begin{tabular}{llr} \toprule Description & Date & Countries\\ \midrule Death of actor Robin Williams. & 2014-08-12 & 202\\ 2014 FIFA World Cup final between Germany and Argentina. & 2014-07-13 & 144\\ 2014 FIFA World Cup starts. & 2014-06-12 & 143\\ 2015 Super Bowl starts. & 2015-02-02 & 130\\ New Year's Eve 2013 & 2013-12-31 & 127\\ \addlinespace Soccer Player Luis Suarez is suspended from 2014 World Cup. & 2014-06-26 & 106\\ Charlie Hebdo shooting in Paris. & 2015-01-07 & 102\\ 2015 Grammy Awards. & 2015-02-09 & 97\\ Professional boxing match between Floyd Mayweather and Manny Pacquiao. & 2015-05-03 & 97\\ \bottomrule \end{tabular}

Self-interest

# Per-event sum-relative interest tagged with the protagonist country of the
# event (no averaging here).
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest)

# Convert to data.table and drop incomplete rows.
part.wrt.all <- na.omit(data.table(part.wrt.all))

# Self-interest of a country: the summed interest it shows in events where it
# is itself the protagonist (0 when it has no such rows).
#
# Built with vapply() over the country codes instead of growing two vectors
# with c() on every iteration.
cs <- as.character(countries_list$country.code)
si <- vapply(
  cs,
  function(cnt) {
    part.wrt.all[c.protagonist == cnt & c.interested == cnt][, sum(interest)]
  },
  numeric(1),
  USE.NAMES = FALSE
)

part.self <- data.table(c.interested = cs, self.interest = si)
# part.self <- part.self[order(-self.interest)][, list(country=map_country(country),
#                                                      self.interest)]

# Attach N, the number of rows per interested country, then sort by
# self-interest, highest first.
part.self <- part.wrt.all[, .N, by = c.interested] %>% left_join(part.self, "c.interested")
part.self <- part.self[order(-self.interest)]

# Horizontal bar chart of the 15 countries with the highest self-interest;
# each bar is annotated with the country's row count N.
top.self <- part.self[1:15]
p <- ggplot(
  top.self,
  aes(
    x = reorder(c.interested, self.interest),
    y = self.interest,
    label = comma(N)
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.2, color = palette[6]) +
  scale_y_continuous(labels = comma, breaks = seq(0, 1, by = 0.1), limits = c(0, 1)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  labs(x = "Country", y = "Interest")

# The title is only added to the printed version, not to the saved files.
print(p + ggtitle("Interest of countries in events protagonized by them"))

for (fname in c("self-interest.eps", "self-interest.pdf")) {
  ggsave(paste0(PDF_PATH, fname), p, dpi = pl.DPI, width = pl.W, height = pl.H)
}

Interest (relative to mean)

# Count, for every (interested country, protagonist country) pair, the number
# of joined rows, i.e. events of that protagonist in which the interested
# country has at least one tweet.
part.tidy2 <- part.tidy[tweets > 0]
part.wrt.all <- part.tidy2 %>%
  full_join(prot, by = "component_id") %>%
  select(component_id,
         c.protagonist = country_code,
         c.interested = country,
         tweets) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(events = length(component_id))

# Drop incomplete pairs, convert to data.table and show the result.
part.wrt.all <- data.table(na.omit(part.wrt.all))
print(part.wrt.all)
##        c.interested c.protagonist events
##     1:           AD            AE      4
##     2:           AD            AF      5
##     3:           AD            AR      4
##     4:           AD            AU     22
##     5:           AD            BB      1
##    ---                                  
## 31679:           ZW            XK      1
## 31680:           ZW            YE     68
## 31681:           ZW            ZA    124
## 31682:           ZW            ZM      1
## 31683:           ZW            ZW     27
# part.wrt.all <- part.wrt.all[, .SD[sum(events) > 50], 
                             # by=c.interested][, .SD[sum(events) > 50], 
                                              # by=c.protagonist]

# Standardise the event counts within each interested country with scale()
# and store the result by reference in a new column sc.events.
part.wrt.all[, `:=`(sc.events = scale(events)), by = c.interested]
# tmp2[, sc.events:=scale(events), by=c.interested]

# Protagonist pairs (ISO codes) plotted from the standardised event counts.
selected_pairs <- list(
  c("UA", "RU"), c("IL", "PS"), c("BR", "DE"), c("US", "GB")
)

# One scatter plot per pair with a smoothed trend per protagonist. The titled
# plot is printed; the untitled plot is saved as EPS and PDF.
# NOTE(review): the file stem is "int-prot-<pair>", the same pattern used when
# saving the max-relative interest figures earlier in this file, so pairs
# shared with that section (IL-PS, BR-DE) overwrite those files - confirm this
# is intended.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, sc.events),
      y = sc.events,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist)) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)

  print(p + ggtitle(pair.title))

  file.stem <- paste(c("int-prot", pair), collapse = "-")
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(PDF_PATH, file.stem, ext), p,
           dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Ukraine & Russia"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "Israel & Palestine"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Brazil & Germany"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "United States & Great Britain"
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

Export interest vectors

# Write one tab-separated file per protagonist country with its rows of
# part.wrt.all (event counts and their standardised values).
countries <- unique(part.wrt.all$c.protagonist)
for (code in countries) {
  x <- part.wrt.all[c.protagonist == code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", code, ".txt")

  write.table(x, f, row.names = FALSE, sep = "\t")
}

 Cosine similarity

# Load the precomputed pairwise cosine values (country 1, country 2, value).
knn.int.cos <- data.table(
  read.table("~/galean/scripts/queries/data/knn_int_cos_scale.txt",
             header = FALSE, sep = "\t",
             stringsAsFactors = TRUE, na.strings = "")
)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# For every c1 keep the row(s) with the largest value, ignoring values equal
# to 1; ties are all kept.
one_nn <- knn.int.cos[dist.cos != 1]
best.rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- na.omit(one_nn[best.rows][order(-dist.cos)])

# Same rows with country codes mapped to names, highest value first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(-dist.cos)]

kable(one_nn.table[1:30])
V1 V2 dist.cos
Australia Great Britain 0.9828
Great Britain Australia 0.9828
United States Great Britain 0.9803
Algeria Sri Lanka 0.9766
Sri Lanka Algeria 0.9766
Russia Ukraine 0.9728
Ukraine Russia 0.9728
Canada United States 0.9721
Bosnia and Herzegovina Honduras 0.9714
Honduras Bosnia and Herzegovina 0.9714
Croatia Bosnia and Herzegovina 0.9713
France Great Britain 0.9708
Taiwan Bosnia and Herzegovina 0.9705
Niger Bosnia and Herzegovina 0.9686
Finland Croatia 0.9684
Costa Rica Sweden 0.9671
Sweden Costa Rica 0.9671
Estonia Kazakhstan 0.9667
Kazakhstan Estonia 0.9667
Denmark Croatia 0.9665
Uruguay Taiwan 0.9660
Zimbabwe Croatia 0.9650
Ivory Coast Croatia 0.9638
Ghana Taiwan 0.9623
China Australia 0.9608
Singapore Taiwan 0.9606
Chile Taiwan 0.9602
Cameroon Niger 0.9564
Norway Croatia 0.9563
Guinea Mali 0.9561
# Cosine values among a few hand-picked countries, highest first, skipping
# values equal to 1.
first.codes <- c("BR", "DE", "UA", "RU", "IL")
second.codes <- c("BR", "DE", "UA", "RU", "PS")
knn.int.cos[c1 %in% first.codes & c2 %in% second.codes & dist.cos != 1][order(-dist.cos)]
##     c1 c2 dist.cos
##  1: RU UA   0.9728
##  2: UA RU   0.9728
##  3: DE RU   0.9256
##  4: RU DE   0.9256
##  5: IL RU   0.8964
##  6: IL UA   0.8879
##  7: DE UA   0.8733
##  8: UA DE   0.8733
##  9: IL PS   0.8650
## 10: IL DE   0.8543
## 11: BR DE   0.7663
## 12: DE BR   0.7663
## 13: UA PS   0.7189
## 14: RU PS   0.6948
## 15: DE PS   0.6401
## 16: BR RU   0.6239
## 17: RU BR   0.6239
## 18: IL BR   0.5985
## 19: BR UA   0.5605
## 20: UA BR   0.5605
## 21: BR PS   0.3527
##     c1 c2 dist.cos
# Tile plot of the country pairs whose cosine value lies strictly between 0.95
# and 1, each tile labelled with the value rounded to two decimals.
knn.int.cos.pl <- knn.int.cos[order(-dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos > .95 & dist.cos < 1]

# The white fill is a constant, so it is set on the geom. Inside aes() the
# string "white" would be treated as a data value and coloured by the default
# discrete fill scale instead of being drawn white.
ggplot(knn.int.cos.pl, aes(x = reorder(c1, -dist.cos),
                           y = reorder(c2, -dist.cos),
                           label = as.character(round(dist.cos, 2)))) +
  geom_tile(fill = "white", size = .1, color = "white") + fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) + scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")

# Protagonist pairs (ISO codes) plotted from the standardised event counts.
selected_pairs <- list(
  c("AU", "GB"), c("US", "GB"), c("DZ", "LK"), c("RU", "UA"), c("CA", "US")
)

# One scatter plot per pair with a thin smoothed trend per protagonist. The
# titled plot is printed; the untitled plot is saved as EPS and PDF.
# NOTE(review): the file stem is "int-prot-<pair>", the same pattern used by
# earlier plotting loops in this file, so pairs that also appear there
# (e.g. RU-UA, US-GB) overwrite those files - confirm this is intended.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, sc.events),
      y = sc.events,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist), size = .5) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)

  print(p + ggtitle(pair.title))

  file.stem <- paste(c("int-prot", pair), collapse = "-")
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(PDF_PATH, file.stem, ext), p,
           dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Australia & Great Britain"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "United States & Great Britain"
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

## [1] "Algeria & Sri Lanka"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Russia & Ukraine"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "Canada & United States"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

Euclidean distance

# Load the pairwise distance table (columns renamed to c1, c2, dist.euc).
# na.strings = '' means only empty fields are read as missing.
knn.int.euc <- read.table(
  '~/galean/scripts/queries/data/knn_int_euc_scale.txt',
  header = FALSE, sep = '\t',
  stringsAsFactors = TRUE, na.strings = ''
)
knn.int.euc <- data.table(knn.int.euc)
setnames(knn.int.euc, c("V1", "V2", "V3"), c("c1", "c2", "dist.euc"))

# Nearest neighbour(s) of every c1: drop zero distances, keep the row(s)
# attaining the per-c1 minimum, sort by distance and drop rows with NAs.
euc_nonzero <- knn.int.euc[dist.euc != 0]
euc_min_rows <- euc_nonzero[, .I[dist.euc == min(dist.euc)], by = c1]$V1
one_nn <- euc_nonzero[euc_min_rows]
one_nn <- na.omit(one_nn[order(dist.euc)])

# Join prot.by_country once per side of the pair. The join key is always
# called country_code, so c1 and then c2 are temporarily renamed to it.
# The code below relies on the two joins yielding columns N.x (for c1) and
# N.y (for c2) -- presumably prot.by_country carries a column N; confirm.
setnames(one_nn, "c1", "country_code")
tmp <- left_join(one_nn, prot.by_country, "country_code")
setnames(tmp, c("country_code", "c2"), c("c1", "country_code"))
tmp <- left_join(tmp, prot.by_country, "country_code")
setnames(tmp, "country_code", "c2")

# Replace codes by country names and order by distance relative to N.x * N.y.
tmp <- tmp[, .(c1 = map_country(c1), c2 = map_country(c2), N.x, N.y, dist.euc)]
tmp <- tmp[order(dist.euc / (N.x * N.y))]

# Only pairs where both sides reach the threshold are tabulated.
# NOTE(review): the origin of the 166.41 cut-off is not visible here.
euc_min_n <- 166.41
kable(tmp[N.x >= euc_min_n & N.y >= euc_min_n][order(dist.euc)])#, format="latex", booktabs=T)
c1 c2 N.x N.y dist.euc
Turkey Indonesia 198 172 1.1442
Yemen Turkey 202 198 1.3416
Afghanistan Turkey 323 198 1.5304
Libya Turkey 253 198 1.6050
Palestine Egypt 360 316 1.6496
Egypt Palestine 316 360 1.6496
Malaysia Turkey 262 198 1.8096
Japan Spain 354 258 1.8327
Spain Japan 258 354 1.8327
Italy Japan 315 354 1.9018
Brazil Spain 236 258 1.9060
Pakistan Germany 453 371 2.0674
Germany Pakistan 371 453 2.0674
Syria Israel 647 561 2.4463
Israel Syria 561 647 2.4463
Ukraine Russia 921 823 2.5557
Russia Ukraine 823 921 2.5557
Nigeria Pakistan 412 453 2.5822
China Canada 646 715 2.6025
Canada China 715 646 2.6025
Iran Syria 496 647 2.6838
Iraq Iran 654 496 2.9270
France Canada 627 715 3.7859
Australia France 974 627 4.1398
India Australia 1561 974 4.8339
Great Britain India 4015 1561 41.7719
United States Great Britain 10162 4015 97.2733
# one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.euc)]
# one_nn.table <- one_nn.table[order(dist.euc)]
# 
# kable(one_nn.table[1:30]) #, format="latex", booktabs=T)

# Non-zero distances among a hand-picked set of countries, closest first.
# NOTE(review): the two sides are not the same set (c1 has IL, c2 has PS),
# so some pairs appear in one direction only -- confirm this is intended.
euc_lookup_c1 <- c("BR", "DE", "UA", "RU", "IL")
euc_lookup_c2 <- c("BR", "DE", "UA", "RU", "PS")
knn.int.euc[
  c1 %in% euc_lookup_c1 & c2 %in% euc_lookup_c2 & dist.euc != 0
][order(dist.euc)]
##     c1 c2 dist.euc
##  1: RU UA   2.5557
##  2: UA RU   2.5557
##  3: BR DE   2.8704
##  4: DE BR   2.8704
##  5: BR PS   3.2902
##  6: DE PS   3.3507
##  7: IL DE   4.1893
##  8: IL RU   4.2992
##  9: IL PS   4.8173
## 10: IL UA   5.2256
## 11: DE RU   5.5783
## 12: RU DE   5.5783
## 13: IL BR   6.0385
## 14: DE UA   6.9912
## 15: UA DE   6.9912
## 16: RU PS   7.5344
## 17: BR RU   7.9699
## 18: RU BR   7.9699
## 19: UA PS   8.5056
## 20: BR UA   9.2345
## 21: UA BR   9.2345
##     c1 c2 dist.euc
# Tile plot of country pairs whose cosine distance lies strictly between 0
# and .5, each tile labelled with the distance rounded to two decimals.
# NOTE(review): this chunk sits in the Euclidean section but plots
# knn.int.cos -- confirm that is intended.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]
ggplot(knn.int.cos.pl, aes(x=reorder(c1, dist.cos), y=reorder(c2, dist.cos),
                           label=as.character(round(dist.cos, 2)))) + 
  # fill is a fixed colour, so it is set on the layer. Inside aes() the
  # string "white" was treated as a one-level discrete variable and the tiles
  # were drawn in the default hue instead of white.
  geom_tile(fill="white", size=.1, color="white") + fte_theme() +
  geom_text(size=2, color=palette[9]) +
  scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x=element_text(size=7, angle=45, hjust=1),
        axis.text.y=element_text(size=7)) + 
  labs(x="Country", y="Country")

# Protagonist pairs to compare in this section.
selected_pairs <- list(
  c("AF", "TR"), c("PK", "IR"), c("EG", "AF"),
  c("JP", "IR"), c("MY", "PK"), c("BR", "DE"),
  c("US", "GB"), c("IL", "PS"), c("UA", "RU"),
  c("KP", "KR"), c("ID", "TR"), c("YE", "TR")
)

# For every pair: scatter each interested country's sc.events for the two
# protagonists (with a smoother per protagonist), echo the titled plot, then
# export the untitled plot as EPS and PDF under PDF_PATH.
# NOTE(review): the y scale is limited to [-.5, 2]; the warnings in the
# knitted output show rows being removed before stat_smooth (446 for US & GB),
# so the smoother is fitted without them -- confirm this is intended.
for (pair in selected_pairs) {
  pair_names <- map_country(pair)
  pair_title <- paste(pair_names, collapse = " & ")
  print(pair_title)

  # Rows of the participation table that concern either protagonist.
  tmp <- part.wrt.all[c.protagonist %in% pair]

  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, sc.events),
      y = sc.events,
      # levels = pair keeps legend order equal to the order inside the pair
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist), size = .5) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Standard deviations from the mean") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(-.5, 2, by = 0.25), limits = c(-.5, 2)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = pair_names
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = pair_names
    )
    # scale_x_discrete(labels=map_country)

  # The title is only added to the echoed plot; the saved files carry none.
  print(p + ggtitle(pair_title))

  # e.g. <PDF_PATH>int-prot-XX-YY.eps / .pdf; the PDF is 2 units narrower
  # than the EPS.
  out_stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  out_widths <- c(".eps" = pl.W, ".pdf" = pl.W - 2)
  for (ext in names(out_widths)) {
    ggsave(paste0(out_stem, ext), p,
           dpi = pl.DPI, width = out_widths[[ext]], height = pl.H)
  }
}
## [1] "Afghanistan & Turkey"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Pakistan & Iran"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Egypt & Afghanistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Japan & Iran"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Malaysia & Pakistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Brazil & Germany"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "United States & Great Britain"
## Warning: Removed 446 rows containing non-finite values (stat_smooth).
## Warning: Removed 447 rows containing missing values (geom_point).
## Warning: Removed 446 rows containing non-finite values (stat_smooth).
## Warning: Removed 446 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 446 rows containing non-finite values (stat_smooth).
## Warning: Removed 446 rows containing missing values (geom_point).

## [1] "Israel & Palestine"
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

## [1] "Ukraine & Russia"
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

## [1] "North Korea & South Korea"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Indonesia & Turkey"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Yemen & Turkey"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

Norm 1 (absolute difference)

# Load the pairwise distance table (columns renamed to c1, c2, dist.abs).
# na.strings = '' means only empty fields are read as missing.
knn.int.abs <- read.table(
  '~/galean/scripts/queries/data/knn_int_abs_scale.txt',
  header = FALSE, sep = '\t',
  stringsAsFactors = TRUE, na.strings = ''
)
knn.int.abs <- data.table(knn.int.abs)
setnames(knn.int.abs, c("V1", "V2", "V3"), c("c1", "c2", "dist.abs"))

# Nearest neighbour(s) of every c1: drop zero distances, keep the row(s)
# attaining the per-c1 minimum, sort by distance and drop rows with NAs.
abs_nonzero <- knn.int.abs[dist.abs != 0]
abs_min_rows <- abs_nonzero[, .I[dist.abs == min(dist.abs)], by = c1]$V1
one_nn <- abs_nonzero[abs_min_rows]
one_nn <- na.omit(one_nn[order(dist.abs)])

# Join prot.by_country once per side of the pair. The join key is always
# called country_code, so c1 and then c2 are temporarily renamed to it.
# The code below relies on the two joins yielding columns N.x (for c1) and
# N.y (for c2) -- presumably prot.by_country carries a column N; confirm.
setnames(one_nn, "c1", "country_code")
tmp <- left_join(one_nn, prot.by_country, "country_code")
setnames(tmp, c("country_code", "c2"), c("c1", "country_code"))
tmp <- left_join(tmp, prot.by_country, "country_code")
setnames(tmp, "country_code", "c2")

# Replace codes by country names and order by distance relative to N.x * N.y.
tmp <- tmp[, .(c1 = map_country(c1), c2 = map_country(c2), N.x, N.y, dist.abs)]
tmp <- tmp[order(dist.abs / (N.x * N.y))]

# Only pairs whose combined count reaches the threshold are tabulated.
abs_min_total <- 500
kable(tmp[N.x + N.y >= abs_min_total][order(dist.abs)])
c1 c2 N.x N.y dist.abs
Libya Afghanistan 253 323 12.8633
Afghanistan Libya 323 253 12.8633
Palestine Egypt 360 316 14.5588
Egypt Palestine 316 360 14.5588
Japan Italy 354 315 14.8907
Italy Japan 315 354 14.8907
Malaysia Afghanistan 262 323 16.0362
Spain Italy 258 315 16.3063
Pakistan Germany 453 371 19.6107
Germany Pakistan 371 453 19.6107
Syria Israel 647 561 21.4050
Israel Syria 561 647 21.4050
Iran Israel 496 561 22.6966
Iraq Syria 654 647 23.1034
Ukraine Russia 921 823 23.4717
Russia Ukraine 823 921 23.4717
Nigeria Pakistan 412 453 23.7334
China Canada 646 715 25.6509
Canada China 715 646 25.6509
France Canada 627 715 38.3235
India Australia 1561 974 41.0575
Australia India 974 1561 41.0575
Great Britain India 4015 1561 553.0469
United States Great Britain 10162 4015 1314.0598
# knn.int.abs[c1 %in% c("BR", "DE", "UA", "RU", "IL") & c2 %in% c("BR", "DE", "UA", "RU", "PS")][dist.abs != 0][order(dist.abs)]

# Tile plot of country pairs whose cosine distance lies strictly between 0
# and .5, each tile labelled with the distance rounded to two decimals.
# NOTE(review): this chunk sits in the Norm 1 section but plots
# knn.int.cos -- confirm that is intended.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]
ggplot(knn.int.cos.pl, aes(x=reorder(c1, dist.cos), y=reorder(c2, dist.cos),
                           label=as.character(round(dist.cos, 2)))) + 
  # fill is a fixed colour, so it is set on the layer. Inside aes() the
  # string "white" was treated as a one-level discrete variable and the tiles
  # were drawn in the default hue instead of white.
  geom_tile(fill="white", size=.1, color="white") + fte_theme() +
  geom_text(size=2, color=palette[9]) +
  scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x=element_text(size=7, angle=45, hjust=1),
        axis.text.y=element_text(size=7)) + 
  labs(x="Country", y="Country")

# Protagonist pairs to compare in this section.
selected_pairs <- list(
  c("IT", "ES"), c("NG", "IT"), c("EG", "AF"),
  c("JP", "IR"), c("MY", "PK")
)

# For every pair: scatter each interested country's sc.events for the two
# protagonists (with a smoother per protagonist), echo the titled plot, then
# export the untitled plot as EPS and PDF under PDF_PATH.
# NOTE(review): the knitted output reports "semi-transparency is not supported
# on this device" once per pair, presumably from the EPS export -- confirm the
# EPS figures still show the smoother's band as intended.
for (pair in selected_pairs) {
  pair_names <- map_country(pair)
  pair_title <- paste(pair_names, collapse = " & ")
  print(pair_title)

  # Rows of the participation table that concern either protagonist.
  tmp <- part.wrt.all[c.protagonist %in% pair]

  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, sc.events),
      y = sc.events,
      # levels = pair keeps legend order equal to the order inside the pair
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist)) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = pair_names
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = pair_names
    )
    # scale_x_discrete(labels=map_country)

  # The title is only added to the echoed plot; the saved files carry none.
  print(p + ggtitle(pair_title))

  # e.g. <PDF_PATH>int-prot-XX-YY.eps / .pdf
  out_stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  for (ext in c(".eps", ".pdf")) {
    ggsave(paste0(out_stem, ext), p, dpi = pl.DPI, width = pl.W, height = pl.H)
  }
}
## [1] "Italy & Spain"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Nigeria & Italy"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Egypt & Afghanistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Japan & Iran"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

## [1] "Malaysia & Pakistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

List of country codes

# Full code -> name lookup table, sorted by country code.
# NOTE(review): the rendered table ends with "NA Namibia", which suggests the
# code "NA" was parsed as a missing value when countries.txt was read (that
# read.table call does not set na.strings) -- confirm and fix at load time.
countries_list[order(country.code)] %>%
  kable()
country.code country.name
AD Andorra
AE United Arab Emirates
AF Afghanistan
AG Antigua and Barbuda
AI Anguilla
AL Albania
AM Armenia
AN Netherlands Antilles
AO Angola
AQ Antarctica
AR Argentina
AS American Samoa
AT Austria
AU Australia
AW Aruba
AX Åland
AZ Azerbaijan
BA Bosnia and Herzegovina
BB Barbados
BD Bangladesh
BE Belgium
BF Burkina Faso
BG Bulgaria
BH Bahrain
BI Burundi
BJ Benin
BL Saint Barthélemy
BM Bermuda
BN Brunei
BO Bolivia
BQ Bonaire
BR Brazil
BS Bahamas
BT Bhutan
BV Bouvet Island
BW Botswana
BY Belarus
BZ Belize
CA Canada
CC Cocos [Keeling] Islands
CD Democratic Republic of the Congo
CF Central African Republic
CG Republic of the Congo
CH Switzerland
CI Ivory Coast
CK Cook Islands
CL Chile
CM Cameroon
CN China
CO Colombia
CR Costa Rica
CS Serbia
CU Cuba
CV Cape Verde
CW Curacao
CX Christmas Island
CY Cyprus
CZ Czechia
DE Germany
DJ Djibouti
DK Denmark
DM Dominica
DO Dominican Republic
DZ Algeria
EC Ecuador
EE Estonia
EG Egypt
EH Western Sahara
ER Eritrea
ES Spain
ET Ethiopia
FI Finland
FJ Fiji
FK Falkland Islands
FM Micronesia
FO Faroe Islands
FR France
GA Gabon
GB Great Britain
GD Grenada
GE Georgia
GF French Guiana
GG Guernsey
GH Ghana
GI Gibraltar
GL Greenland
GM Gambia
GN Guinea
GP Guadeloupe
GQ Equatorial Guinea
GR Greece
GS South Georgia and the South Sandwich Islands
GT Guatemala
GU Guam
GW Guinea-Bissau
GY Guyana
HK Hong Kong
HM Heard Island and McDonald Islands
HN Honduras
HR Croatia
HT Haiti
HU Hungary
ID Indonesia
IE Ireland
IL Israel
IM Isle of Man
IN India
IO British Indian Ocean Territory
IQ Iraq
IR Iran
IS Iceland
IT Italy
JE Jersey
JM Jamaica
JO Jordan
JP Japan
KE Kenya
KG Kyrgyzstan
KH Cambodia
KI Kiribati
KM Comoros
KN Saint Kitts and Nevis
KP North Korea
KR South Korea
KW Kuwait
KY Cayman Islands
KZ Kazakhstan
LA Laos
LB Lebanon
LC Saint Lucia
LI Liechtenstein
LK Sri Lanka
LR Liberia
LS Lesotho
LT Lithuania
LU Luxembourg
LV Latvia
LY Libya
MA Morocco
MC Monaco
MD Moldova
ME Montenegro
MF Saint Martin
MG Madagascar
MH Marshall Islands
MK Macedonia
ML Mali
MM Myanmar [Burma]
MN Mongolia
MO Macao
MP Northern Mariana Islands
MQ Martinique
MR Mauritania
MS Montserrat
MT Malta
MU Mauritius
MV Maldives
MW Malawi
MX Mexico
MY Malaysia
MZ Mozambique
NC New Caledonia
NE Niger
NF Norfolk Island
NG Nigeria
NI Nicaragua
NL Netherlands
NO Norway
NP Nepal
NR Nauru
NU Niue
NZ New Zealand
OM Oman
PA Panama
PE Peru
PF French Polynesia
PG Papua
PH Philippines
PK Pakistan
PL Poland
PM Saint Pierre and Miquelon
PN Pitcairn Islands
PR Puerto Rico
PS Palestine
PT Portugal
PW Palau
PY Paraguay
QA Qatar
RE Réunion
RO Romania
RS Serbia
RU Russia
RW Rwanda
SA Saudi Arabia
SB Solomon Islands
SC Seychelles
SD Sudan
SE Sweden
SG Singapore
SH Saint Helena
SI Slovenia
SJ Svalbard and Jan Mayen
SK Slovakia
SL Sierra Leone
SM San Marino
SN Senegal
SO Somalia
SR Suriname
SS South Sudan
ST São Tomé and Príncipe
SV El Salvador
SX Sint Maarten
SY Syria
SZ Swaziland
TC Turks and Caicos Islands
TD Chad
TF French Southern Territories
TG Togo
TH Thailand
TJ Tajikistan
TK Tokelau
TL East Timor
TM Turkmenistan
TN Tunisia
TO Tonga
TR Turkey
TT Trinidad and Tobago
TV Tuvalu
TW Taiwan
TZ Tanzania
UA Ukraine
UG Uganda
UM U.S. Minor Outlying Islands
US United States
UY Uruguay
UZ Uzbekistan
VA Vatican City
VC Saint Vincent and the Grenadines
VE Venezuela
VG British Virgin Islands
VI U.S. Virgin Islands
VN Vietnam
VU Vanuatu
WF Wallis and Futuna
YE Yemen
YT Mayotte
ZA South Africa
ZM Zambia
ZW Zimbabwe
NA Namibia