# message = F, warning = F
library(tidyr)
library(dplyr)
library(ggplot2)
library(data.table)
library(knitr)
library(bit64)
library(extrafont)
library(scales)
library(grid)
library(RColorBrewer)
# Nine-step grey ramp from ColorBrewer; individual shades (e.g. palette[6]) are
# used below as text colours. Note that this variable shares its name with
# grDevices::palette().
palette <- brewer.pal("Greys", n=9)
Protagonist countries
# Raw component-location data (protagonist countries): component_id,
# country_code and frequency columns are used downstream.
protagonists <- read.table(
  "~/galean/scripts/queries/data/componentlocation.tsv",
  sep = "\t",
  header = TRUE,
  na.strings = "",
  stringsAsFactors = TRUE
)
prot.dt <- data.table(protagonists)
# write.table(prot.dt[, component_id, country_code], 'scripts/queries/data/country_component.txt', row.names=F)
# Keep the three columns of interest, highest frequency first.
prot <- arrange(
  select(prot.dt, component_id, country_code, frequency),
  desc(frequency)
)
Interested countries
# Wide participation table: one row per component_id, one column per country.
participants <- read.table(
  "~/galean/scripts/queries/data/participation_data.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = ""
)
# Reshape to long form (component_id, country, tweets), upper-case the country
# codes and sort by event, then by descending tweet count.
part.tidy <- gather(participants, country, tweets, -component_id)
part.tidy <- mutate(part.tidy, country = toupper(country))
part.tidy <- arrange(part.tidy, component_id, desc(tweets))
part.tidy <- data.table(part.tidy)
# One column arrives as "IN." (presumably read.table() altered the header to
# make it a syntactically valid name -- verify against the input file).
part.tidy[country == "IN.", country := "IN"]
part.tidy[, country := factor(country)]
Countries List
# Headerless, tab-separated lookup table; its two columns are renamed to
# country.code and country.name.
countries_list <- data.table(
  read.table(
    "~/galean/scripts/queries/data/countries.txt",
    sep = "\t",
    header = FALSE,
    stringsAsFactors = TRUE
  )
)
setnames(
  countries_list,
  old = c("V1", "V2"),
  new = c("country.code", "country.name")
)
Overall summary:
# Corpus-wide totals, hard-coded here rather than computed from the loaded tables.
events <- 25481
tweets <- 193447671
users <- 26127625
summ <- data.frame(events, tweets, users)
# Render as a one-row table with thousands separators.
kable(summ, row.names = FALSE, format.args=list(big.mark=','))
events | tweets | users |
---|---|---|
25,481 | 193,447,671 | 26,127,625 |
Protagonist countries:
# component_id: event
# frequency: no. of times the country was mentioned in the tweets of the event
head(prot)
## component_id country_code frequency
## 1: 17658 US 584530
## 2: 24041 NP 547669
## 3: 17579 FR 349313
## 4: 17406 FR 345206
## 5: 24089 NP 341518
## 6: 24391 GB 245290
summary(prot)
## component_id country_code frequency
## Min. : 1 US :10162 Min. : 30
## 1st Qu.: 7665 GB : 4015 1st Qu.: 100
## Median :15180 IN : 1561 Median : 285
## Mean :14132 AU : 974 Mean : 1918
## 3rd Qu.:20635 UA : 921 3rd Qu.: 982
## Max. :25481 RU : 823 Max. :584530
## (Other):14161
# ggplot(prot, aes(x=1, y=frequency)) + geom_boxplot()
Interested countries:
head(part.tidy)
## component_id country tweets
## 1: 1 US 319
## 2: 1 CO 30
## 3: 1 GB 18
## 4: 1 ID 7
## 5: 1 CA 6
## 6: 1 JP 5
summary(part.tidy)
## component_id country tweets
## Min. : 1 AD : 20066 Min. : 0.00
## 1st Qu.: 6638 AE : 20066 1st Qu.: 0.00
## Median :13566 AF : 20066 Median : 0.00
## Mean :13181 AG : 20066 Mean : 6.63
## 3rd Qu.:19738 AI : 20066 3rd Qu.: 0.00
## Max. :25481 AL : 20066 Max. :108599.00
## (Other):4896104
# ggplot(part.tidy, aes(x=1, y=tweets)) + geom_boxplot()
Remove countries with few protagonized events (< median) and remove events with few tweets (< median):
# prot.events_per_country <- prot[, .N, by=country_code]
# summary(prot.events_per_country$N)
# to_remove <- prot.events_per_country[N < 18.5]$country_code
# prot <- prot[!(country_code %in% to_remove)]
# summary(prot$frequency)
# to_remove <- prot[frequency < 288]$component_id
# prot <- prot[!(component_id %in% to_remove)]
# Number of rows in `prot` (event/country records) per protagonist country.
prot.per_country <- prot[, .N, by = country_code]
# Keep the 25 countries with the highest counts, sorted ascending so that the
# largest bar is drawn at the top after coord_flip(). tail() replaces the
# previous `(n - 25):n` slice, which returned 26 rows and raised an error when
# fewer than 25 countries were available.
prot.per_country.top25 <- tail(prot.per_country[order(N)], 25)
p <- ggplot(
  prot.per_country.top25,
  aes(x = factor(country_code, levels = country_code), y = N, label = comma(N))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 12000, by = 1000),
    limits = c(0, 12000)
  ) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") +
  ylab("Events")
# The title is only added to the on-screen version; the saved files are untitled.
print(p + ggtitle("Top 25 countries protagonizing events"))
ggsave(paste0(PDF_PATH, "protagonist-bias.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "protagonist-bias.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
quantile(part.tidy$tweets, c(.8, .9, .95, .99, .999, .9999))
## 80% 90% 95% 99% 99.9% 99.99%
## 0.000 2.000 8.000 77.000 1026.000 6053.901
# Keep only (event, country) rows at or above the 99th percentile of tweets
# (77, taken from the quantiles printed above).
part.tidy.p99 <- part.tidy[tweets >= 77, ]
# Number of such rows per interested country.
part.per_country <- part.tidy.p99[, .N, by = country]
# Keep the 25 countries with the highest counts, ascending so the largest bar
# ends up on top after coord_flip(). tail() replaces the previous `(n - 25):n`
# slice, which returned 26 rows and raised an error with fewer than 25 countries.
part.per_country.top25 <- tail(part.per_country[order(N)], 25)
p <- ggplot(
  part.per_country.top25,
  aes(x = factor(country, levels = country), y = N, label = comma(N))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 12000, by = 1000),
    limits = c(0, 12000)
  ) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") +
  ylab("Events")
# The title is only added to the on-screen version; the saved files are untitled.
print(p + ggtitle("Top 25 countries interested in events"))
ggsave(paste0(PDF_PATH, "interest-bias.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "interest-bias.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
# Total tweets per country over the 99th-percentile rows; the unnamed sum is
# stored in column V1.
part.sum_per_country <- part.tidy.p99[, sum(tweets), by = country]
# Keep the 25 countries with the largest totals, ascending so the largest bar
# ends up on top after coord_flip(). tail() replaces the previous `(n - 25):n`
# slice, which returned 26 rows and raised an error with fewer than 25 countries.
part.sum_per_country.top25 <- tail(part.sum_per_country[order(V1)], 25)
p <- ggplot(
  part.sum_per_country.top25,
  aes(x = factor(country, levels = country), y = V1, label = comma(V1))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 14e6, by = 2e6),
    limits = c(0, 14e6)
  ) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Country") +
  ylab("Tweets")
# The title is only added to the on-screen version; the saved files are untitled.
print(p + ggtitle("Top 25 countries with tweets"))
ggsave(paste0(PDF_PATH, "tweets-per-country.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "tweets-per-country.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
For most of the users, it was not possible to identify a location. The rest is distributed mostly among US, GB, CA and ID (Indonesia).
# Headerless two-column table: V1 becomes `country` (upper-cased) and V2
# becomes `frequency`, plotted below as the number of users.
user_countries <- read.table(
  "~/galean/scripts/queries/data/locations_distribution.txt",
  sep = "\t",
  stringsAsFactors = FALSE,
  na.strings = ""
)
user_countries <- data.table(
  country = toupper(user_countries$V1),
  frequency = user_countries$V2
)
user_countries <- user_countries[order(frequency)]
n <- nrow(user_countries)
# Rows n - 25 through n of the ascending table, i.e. the 26 largest entries.
user_countries_pl <- user_countries[(n - 25):n]
p <- ggplot(
  user_countries_pl,
  aes(x = factor(country, levels = country), y = frequency, label = comma(frequency))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Users") +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 17.5e6, by = 2e6),
    limits = c(0, 17.5e6)
  ) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black")
print(p + ggtitle("Users per country tweeting about events"))
ggsave(paste0(PDF_PATH, "users-per-country.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "users-per-country.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
# Same chart on a tighter y scale, after filtering on country != "<NA>".
# NOTE(review): the comparison is against the literal string "<NA>"; rows whose
# country is a true NA are presumably dropped as well because the condition
# evaluates to NA for them -- confirm which form the missing code takes.
p <- ggplot(
  user_countries_pl[country != "<NA>"],
  aes(x = factor(country, levels = country), y = frequency, label = comma(frequency))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Users") +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 4e6, by = .5e6),
    limits = c(0, 4e6)
  ) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black")
print(p + ggtitle("Users per country tweeting about events"))
ggsave(paste0(PDF_PATH, "users-per-country-f.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "users-per-country-f.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
A country is represented as a vector with respect to all other countries \(C\):
\[c_i = \Big( \frac{\text{total events protagonized by } c_i \text{ and } c_j}{\text{total events protagonized by } c_i}, c_j \in C \Big) \]
# All distinct protagonist country codes.
countries <- unique(prot, by = "country_code")[, country_code]
# Self-join on the event id: one row per ordered pair of countries that share
# an event (each country is also paired with itself).
coprot <- left_join(prot, prot, by = "component_id")
# do not consider self protagonism in events with more than 1 protagonist
# ...not sure what this was for; it is not used
# coprot <- coprot[coprot[, country_code.x != country_code.y | .N == 1, by=component_id]$V1]
# Events protagonized per country.
prot.by_country <- prot[, .N, by = country_code]
coprot.vector <- list()
for (country in countries) {
  total_events_ci <- prot.by_country[country_code == country, N]
  coprot_i <- coprot[country_code.x == country]
  coprot_freq <- coprot_i[, .N, by = country_code.y]
  # Shared-event count divided by this country's own event count.
  coprot_freq[, per := N / total_events_ci]
  coprot.vector[[country]] <- coprot_freq
}
Bar length represents the fraction of the main country's events (the main country is named in each plot title) that were also protagonized by the country on the axis.
# One co-protagonism bar chart per selected country.
selected_countries <- c("US", "GB", "CA", "NG", "UA", "RU", "JP", "CN", "PS", "IL")
for (country in selected_countries) {
  # Co-protagonists of `country`, excluding itself, in ascending order of share.
  tmp <- coprot.vector[[country]][country_code.y != country][order(per)]
  n <- nrow(tmp)
  # Rows n - 15 through n are plotted, i.e. the 16 largest shares.
  p <- ggplot(
    tmp[(n - 15):n],
    aes(
      x = factor(country_code.y, levels = country_code.y),
      y = per,
      label = as.character(round(per, 4))
    )
  ) +
    geom_bar(stat = "identity") +
    coord_flip() +
    xlab("Country") +
    ylab("Co-protagonism") +
    geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
    scale_y_continuous(breaks = seq(0, 1, by = .125), limits = c(0, .5)) +
    scale_x_discrete(labels = map_country) +
    fte_theme() +
    geom_hline(yintercept = 0, size = 0.4, color = "black")
  print(p + ggtitle(map_country(country)))
  ggsave(
    paste0(PDF_PATH, "co-prot-", country, ".eps"), p,
    dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1
  )
  ggsave(
    paste0(PDF_PATH, "co-prot-", country, ".pdf"), p,
    dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1
  )
}
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
# Rows of the self-join per country (stored as a double in V1), then each
# country's share of all such rows, sorted ascending.
tmp <- data.table(left_join(prot, prot, "component_id"))[, as.numeric(.N), by = country_code.x]
tmp[, per := V1 / sum(V1)]
tmp <- tmp[order(per)]
n <- nrow(tmp)
# Rows n - 15 through n are plotted, i.e. the 16 largest shares.
p <- ggplot(
  tmp[(n - 15):n],
  aes(
    x = factor(country_code.x, levels = country_code.x),
    y = per,
    label = as.character(round(per, 4))
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Co-protagonism") +
  geom_text(size = 2.5, hjust = -.1, color = palette[6]) +
  scale_y_continuous(breaks = seq(0, 1, by = .125), limits = c(0, .5)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black")
print(p + ggtitle("All"))
ggsave(
  paste0(PDF_PATH, "co-prot-all", ".eps"), p,
  dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1
)
ggsave(
  paste0(PDF_PATH, "co-prot-all", ".pdf"), p,
  dpi = pl.DPI / 1.5, width = pl.W - 3, height = pl.H - 1
)
Remake plots from Sec 6.1:
# For every country in `to_compare`, build a dense co-protagonism vector over
# all `countries`: the `per` value from coprot.vector where the pair shares at
# least one event, 0 otherwise.
prot.comp <- list()
to_compare <- c("EC", "HN", "IL", "PS", "KP", "KR", "BD", "US", "GB", "UA", "RU", "MY", "VN", "CN", "JP", "CA", "NG")
country_names <- as.character(countries)
for (cnt in to_compare) {
  shares <- coprot.vector[[cnt]]
  if (is.null(shares)) {
    stop("no co-protagonism vector available for country ", cnt, call. = FALSE)
  }
  # A single vectorised lookup replaces growing the vector one element at a
  # time; countries with no matching row come back as NA and are set to 0.
  covector <- shares$per[match(country_names, as.character(shares$country_code.y))]
  covector[is.na(covector)] <- 0
  names(covector) <- country_names
  prot.comp[[cnt]] <- covector
}
# One column per compared country (in `to_compare` order) plus the country key.
prot.comp_sel <- as.data.table(prot.comp[to_compare])
prot.comp_sel[, country := countries]
# Long form: x = country on the axis, y = compared country, value = share.
prot.comp_sel2 <- prot.comp_sel %>%
  gather(y, value, -country) %>%
  mutate(x = country) %>%
  select(x, y, value)
prot.comp_sel2 <- data.table(prot.comp_sel2)
# Country pairs to compare; one scatter plot of co-protagonism values per pair.
pairs <- list(
  c("EC", "HN"), c("IL", "PS"), c("KP", "KR"), c("BD", "US"), c("GB", "US"),
  c("RU", "UA"), c("EC", "US"), c("IL", "KP"), c("CN", "MY"), c("JP", "CN"),
  c("CA", "NG")
)
for (pair in pairs) {
  print(paste(map_country(pair), collapse = " & "))
  tmp <- prot.comp_sel2[y %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(x, value),
      y = value,
      group = factor(y, levels = pair),
      color = factor(y, levels = pair),
      shape = factor(y, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(x, value), y=value, group=y), size=.5) +
    xlab("Countries") +
    ylab("Co-protagonism") +
    fte_theme() +
    # scale_x_discrete(labels=c())
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6),
      axis.text.x = element_blank()
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = map_country(pair)
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      solid = FALSE,
      breaks = pair,
      labels = map_country(pair)
    )
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(paste(map_country(pair), collapse = " & ")))
  # File names look like co-prot-<c1>-<c2>.<ext>.
  ggsave(
    paste0(PDF_PATH, paste(c("co-prot", pair), collapse = "-"), ".eps"),
    p, dpi = pl.DPI, width = pl.W, height = pl.H
  )
  ggsave(
    paste0(PDF_PATH, paste(c("co-prot", pair), collapse = "-"), ".pdf"),
    p, dpi = pl.DPI, width = pl.W, height = pl.H
  )
}
## [1] "Ecuador & Honduras"
## Warning: Removed 182 rows containing missing values (geom_point).
## Warning: Removed 182 rows containing missing values (geom_point).
## Warning: Removed 182 rows containing missing values (geom_point).
## [1] "Israel & Palestine"
## Warning: Removed 117 rows containing missing values (geom_point).
## Warning: Removed 107 rows containing missing values (geom_point).
## Warning: Removed 129 rows containing missing values (geom_point).
## [1] "North Korea & South Korea"
## Warning: Removed 124 rows containing missing values (geom_point).
## Warning: Removed 146 rows containing missing values (geom_point).
## Warning: Removed 130 rows containing missing values (geom_point).
## [1] "Bangladesh & United States"
## Warning: Removed 79 rows containing missing values (geom_point).
## Warning: Removed 81 rows containing missing values (geom_point).
## Warning: Removed 76 rows containing missing values (geom_point).
## [1] "Great Britain & United States"
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 23 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_point).
## [1] "Russia & Ukraine"
## Warning: Removed 85 rows containing missing values (geom_point).
## Warning: Removed 94 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## [1] "Ecuador & United States"
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 93 rows containing missing values (geom_point).
## Warning: Removed 87 rows containing missing values (geom_point).
## [1] "Israel & North Korea"
## Warning: Removed 113 rows containing missing values (geom_point).
## Warning: Removed 116 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## [1] "China & Malaysia"
## Warning: Removed 109 rows containing missing values (geom_point).
## Warning: Removed 113 rows containing missing values (geom_point).
## Warning: Removed 119 rows containing missing values (geom_point).
## [1] "Japan & China"
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 125 rows containing missing values (geom_point).
## Warning: Removed 96 rows containing missing values (geom_point).
## [1] "Canada & Nigeria"
## Warning: Removed 101 rows containing missing values (geom_point).
## Warning: Removed 107 rows containing missing values (geom_point).
## Warning: Removed 91 rows containing missing values (geom_point).
Export vectors to files:
# Write each country's co-protagonism table to its own tab-separated file.
for (code in countries) {
  out_file <- paste0("~/galean/scripts/queries/data/protagonist_vectors/", code, ".txt")
  write.table(coprot.vector[[code]], out_file, row.names = FALSE, sep = "\t")
}
Euclidean distance is not suitable for similarity between countries (vectors), due to the high sparsity of the vectors. Cosine similarity is better.
# Pairwise distance tables (c1, c2, distance) for the cosine and euclidean
# metrics, read from headerless tab-separated files.
knn.cos <- data.table(
  read.table(
    "~/galean/scripts/queries/data/knn_cos.txt",
    header = FALSE, sep = "\t", stringsAsFactors = TRUE, na.strings = ""
  )
)
knn.euc <- data.table(
  read.table(
    "~/galean/scripts/queries/data/knn_euc.txt",
    header = FALSE, sep = "\t", stringsAsFactors = TRUE, na.strings = ""
  )
)
setnames(knn.cos, old = c("V1", "V2", "V3"), new = c("c1", "c2", "dist.cos"))
setnames(knn.euc, old = c("V1", "V2", "V3"), new = c("c1", "c2", "dist.euc"))

# Each chart below plots 1 - distance for one reference country, so that a
# longer bar means a more similar country.
knn.euc.us <- knn.euc[c1 == "US" & c2 != "US"][order(dist.euc)]
ggplot(knn.euc.us, aes(x = factor(c2, levels = c2), y = 1 - dist.euc)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Euclidean distance") +
  ggtitle("Most similar countries to US (euclidean distance) (higher is more similar)")
# US
knn.cos.us <- knn.cos[c1 == "US" & c2 != "US"][order(dist.cos)]
ggplot(knn.cos.us, aes(x = factor(c2, levels = c2), y = (1 - dist.cos))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Cosine distance") +
  ggtitle("Most similar countries to US (cosine distance) (higher is more similar)")
# GB
knn.cos.gb <- knn.cos[c1 == "GB" & c2 != "GB"][order(dist.cos)]
ggplot(knn.cos.gb, aes(x = factor(c2, levels = c2), y = 1 - dist.cos)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Cosine distance") +
  ggtitle("Most similar countries to GB (cosine distance) (higher is more similar)")
# UA
knn.cos.ua <- knn.cos[c1 == "UA" & c2 != "UA"][order(dist.cos)]
ggplot(knn.cos.ua, aes(x = factor(c2, levels = c2), y = 1 - dist.cos)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Cosine distance") +
  ggtitle("Most similar countries to Ukraine (cosine distance) (higher is more similar)")
# VE
knn.cos.ve <- knn.cos[c1 == "VE" & c2 != "VE"][order(dist.cos)]
ggplot(knn.cos.ve, aes(x = factor(c2, levels = c2), y = 1 - dist.cos)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Cosine distance") +
  ggtitle("Most similar countries to Venezuela (cosine distance) (higher is more similar)")
# CL
knn.cos.cl <- knn.cos[c1 == "CL" & c2 != "CL"][order(dist.cos)]
ggplot(knn.cos.cl, aes(x = factor(c2, levels = c2), y = 1 - dist.cos)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Country") +
  ylab("Cosine distance") +
  ggtitle("Most similar countries to Chile (cosine distance) (higher is more similar)")
Get all countries with their most similar counterparts:
# Drop zero distances (NOTE(review): presumably the self-pairs -- confirm).
one_nn <- knn.cos[dist.cos != 0]
# Row indices of the minimum-distance neighbour(s) for every c1.
closest_idx <- one_nn[, .I[dist.cos == min(dist.cos)], by = c1]$V1
one_nn <- one_nn[closest_idx]
one_nn <- na.omit(one_nn[order(dist.cos)])
# Country names instead of codes; the unnamed columns come out as V1 and V2.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)]
one_nn.table <- one_nn.table[order(dist.cos)]
kable(one_nn.table[1:30]) #, format="latex", booktabs=T)
V1 | V2 | dist.cos |
---|---|---|
Russia | Ukraine | 0.0079 |
Ukraine | Russia | 0.0079 |
Canada | Nigeria | 0.0211 |
Nigeria | Canada | 0.0211 |
China | Japan | 0.0239 |
Japan | China | 0.0239 |
France | Italy | 0.0259 |
Italy | France | 0.0259 |
Peru | Canada | 0.0266 |
Iraq | Syria | 0.0301 |
Syria | Iraq | 0.0301 |
Afghanistan | Cuba | 0.0314 |
Cuba | Afghanistan | 0.0314 |
Spain | France | 0.0320 |
Mexico | Netherlands | 0.0338 |
Netherlands | Mexico | 0.0338 |
Kenya | Nigeria | 0.0355 |
Puerto Rico | Seychelles | 0.0362 |
Seychelles | Puerto Rico | 0.0362 |
Iran | Iraq | 0.0364 |
Libya | Iraq | 0.0365 |
Venezuela | Cuba | 0.0381 |
Germany | France | 0.0384 |
Australia | India | 0.0386 |
India | Australia | 0.0386 |
United Arab Emirates | South Korea | 0.0396 |
South Korea | United Arab Emirates | 0.0396 |
Sudan | Libya | 0.0404 |
Antarctica | Canada | 0.0407 |
Nepal | South Africa | 0.0429 |
# Country pairs with a cosine distance strictly between 0 and 0.05, drawn as a
# grid of tiles labelled with the rounded distance.
knn.cos.pl <- knn.cos[order(dist.cos)]
knn.cos.pl <- knn.cos.pl[dist.cos < .05 & dist.cos > 0]
ggplot(
  knn.cos.pl,
  aes(
    x = reorder(c1, dist.cos),
    y = reorder(c2, dist.cos),
    label = as.character(round(dist.cos, 2))
  )
) +
  # The white fill is set as a constant on the layer. Inside aes() the string
  # "white" was mapped as a data value, which gives the default discrete colour
  # and a legend entry instead of white tiles.
  geom_tile(fill = "white", size = .1, color = "white") +
  fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  # scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 7)
  ) +
  labs(x = "Country", y = "Country")
A country is represented as the vector of Jaccard similarities between it and the rest:
\[ c_i = \Big( \frac{\text{events protagonized by } c_i \text{ and } c_j}{\text{events protagonized by } c_i \text{ or } c_j}, c_j \in C \Big)\]
The co-protagonism between two countries is defined as follows:
\[ s(c1, c2) = \frac{|E(c1) \cap E(c2)|}{|E(c1) \cup E(c2)|} \]
Where \(E(c1)\) is the set of events protagonized by \(c1\).
quantile(prot[, .N, by=country_code]$N, c(.5, .75, .9, .95, .99))
## 50% 75% 90% 95% 99%
## 18.50 80.25 315.50 631.75 1683.70
prot.events_per_country <- prot[, .N, by=country_code]
summary(prot.events_per_country$N)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 18.50 166.40 80.25 10160.00
# Drop countries that protagonize fewer events than the mean count per country.
to_remove <- prot.events_per_country[N < mean(N)]$country_code
prot.f <- prot[!(country_code %in% to_remove)]
countries <- unique(prot.f, by = "country_code")[, country_code]

# Event ids per country, looked up once instead of being re-filtered from
# `prot` for every pair.
country_ids <- as.character(countries)
events_by_country <- split(prot$component_id, prot$country_code)[country_ids]

# All ordered pairs (ci, cj), with ci varying slowest, and their Jaccard
# similarity |E(ci) n E(cj)| / |E(ci) u E(cj)|. The vectors are built at their
# final length rather than grown with c() inside a nested loop.
n_countries <- length(country_ids)
cis <- rep(country_ids, each = n_countries)
cjs <- rep(country_ids, times = n_countries)
jacs <- vapply(
  seq_along(cis),
  function(k) {
    events_ci <- events_by_country[[cis[k]]]
    events_cj <- events_by_country[[cjs[k]]]
    length(intersect(events_ci, events_cj)) / length(union(events_ci, events_cj))
  },
  numeric(1)
)
prot.sim <- data.table(country.x = cis, country.y = cjs, jacc.sim = jacs)

# Pairs with some, but not total, overlap (this also removes the self-pairs),
# most similar first.
prot.most_similar <- prot.sim[jacc.sim > 0 & jacc.sim < 1][order(-jacc.sim)]
# Attach each side's event count (N.x for country.x, N.y for country.y). The
# join key is temporarily renamed to country_code so that both joins can use
# the same key name as prot.by_country.
setnames(prot.most_similar, "country.x", "country_code")
tmp <- prot.most_similar %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code", "country.y"), c("country.x", "country_code"))
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("country.y"))
prot.most_similar <- tmp
prot.most_similar <- prot.most_similar[order(-jacc.sim)]
# Top 50 pairs with country names instead of codes (unnamed columns V1, V2).
prot.most_similar.table <- prot.most_similar[1:50][, list(
  map_country(country.x),
  map_country(country.y),
  N.x, N.y,
  jacc.sim
)]
kable(prot.most_similar.table[1:50])#, format="latex", booktabs=T)
V1 | V2 | N.x | N.y | jacc.sim |
---|---|---|---|---|
Palestine | Israel | 360 | 561 | 0.2863128 |
Israel | Palestine | 561 | 360 | 0.2863128 |
Ukraine | Russia | 921 | 823 | 0.2094313 |
Russia | Ukraine | 823 | 921 | 0.2094313 |
United States | Great Britain | 10162 | 4015 | 0.0966120 |
Great Britain | United States | 4015 | 10162 | 0.0966120 |
Syria | Iraq | 647 | 654 | 0.0832639 |
Iraq | Syria | 654 | 647 | 0.0832639 |
Pakistan | India | 453 | 1561 | 0.0752803 |
India | Pakistan | 1561 | 453 | 0.0752803 |
Iran | Israel | 496 | 561 | 0.0698381 |
Israel | Iran | 561 | 496 | 0.0698381 |
Japan | China | 354 | 646 | 0.0604454 |
China | Japan | 646 | 354 | 0.0604454 |
France | Germany | 627 | 371 | 0.0583245 |
Germany | France | 371 | 627 | 0.0583245 |
Great Britain | Australia | 4015 | 974 | 0.0576638 |
Australia | Great Britain | 974 | 4015 | 0.0576638 |
Germany | Brazil | 371 | 236 | 0.0574913 |
Brazil | Germany | 236 | 371 | 0.0574913 |
Turkey | Syria | 198 | 647 | 0.0536160 |
Syria | Turkey | 647 | 198 | 0.0536160 |
Iran | Iraq | 496 | 654 | 0.0511883 |
Iraq | Iran | 654 | 496 | 0.0511883 |
Malaysia | Australia | 262 | 974 | 0.0492360 |
Australia | Malaysia | 974 | 262 | 0.0492360 |
India | Australia | 1561 | 974 | 0.0475207 |
Australia | India | 974 | 1561 | 0.0475207 |
Great Britain | Canada | 4015 | 715 | 0.0443807 |
Canada | Great Britain | 715 | 4015 | 0.0443807 |
Libya | Egypt | 253 | 316 | 0.0440367 |
Egypt | Libya | 316 | 253 | 0.0440367 |
India | Great Britain | 1561 | 4015 | 0.0436085 |
Great Britain | India | 4015 | 1561 | 0.0436085 |
Palestine | Egypt | 360 | 316 | 0.0432099 |
Egypt | Palestine | 316 | 360 | 0.0432099 |
Syria | Iran | 647 | 496 | 0.0428832 |
Iran | Syria | 496 | 647 | 0.0428832 |
Spain | Germany | 258 | 371 | 0.0413907 |
Germany | Spain | 371 | 258 | 0.0413907 |
United States | Canada | 10162 | 715 | 0.0370900 |
Canada | United States | 715 | 10162 | 0.0370900 |
Great Britain | France | 4015 | 627 | 0.0363921 |
France | Great Britain | 627 | 4015 | 0.0363921 |
Yemen | Iran | 202 | 496 | 0.0356083 |
Iran | Yemen | 496 | 202 | 0.0356083 |
India | China | 1561 | 646 | 0.0346929 |
China | India | 646 | 1561 | 0.0346929 |
United States | India | 10162 | 1561 | 0.0345041 |
India | United States | 1561 | 10162 | 0.0345041 |
# Tile grid of Jaccard similarities between country pairs; the fill colour is
# on a log-transformed scale and each tile shows the rounded value.
ggplot(
  prot.most_similar,
  aes(
    x = reorder(country.x, -jacc.sim),
    y = reorder(country.y, -jacc.sim),
    fill = jacc.sim,
    label = as.character(round(jacc.sim, 2))
  )
) +
  geom_tile(size = .1) +
  fte_theme() +
  geom_text(size = 2, color = palette[1]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 7)
  ) +
  labs(x = "Country", y = "Country") +
  scale_fill_gradient(trans = "log")
# For each selected country, print its name followed by a table of the first
# ten rows of its similarity list, excluding pairs with similarity 1.
selected_countries <- c("IL", "PS", "UA", "RU", "GB", "US", "IQ", "SY")
for (country in selected_countries) {
  tmp <- prot.sim[country.x == country][order(-jacc.sim)][jacc.sim != 1][1:10]
  tmp <- tmp[, list(Country = map_country(country.y), Similarity = jacc.sim)]
  tbl <- kable(tmp) # , format="latex", booktabs=T)
  print(map_country(country))
  print(tbl)
}
## IL
## "Israel"
##
##
## Country Similarity
## -------------- -----------
## Palestine 0.2863128
## Iran 0.0698381
## Egypt 0.0317647
## Syria 0.0307167
## France 0.0259067
## United States 0.0208492
## Iraq 0.0201511
## Pakistan 0.0201207
## Great Britain 0.0184732
## Ukraine 0.0178571
## PS
## "Palestine"
##
##
## Country Similarity
## -------------- -----------
## Israel 0.2863128
## Egypt 0.0432099
## Nigeria 0.0157895
## Pakistan 0.0149813
## Iraq 0.0129870
## Great Britain 0.0115607
## Ukraine 0.0110497
## Syria 0.0110442
## China 0.0100402
## United States 0.0090142
## UA
## "Ukraine"
##
##
## Country Similarity
## -------------- -----------
## Russia 0.2094313
## United States 0.0249699
## France 0.0238095
## Germany 0.0213439
## Syria 0.0188434
## Great Britain 0.0183619
## Israel 0.0178571
## Malaysia 0.0154506
## Iraq 0.0148196
## Nigeria 0.0144597
## RU
## "Russia"
##
##
## Country Similarity
## -------------- -----------
## Ukraine 0.2094313
## France 0.0305615
## Canada 0.0301407
## Syria 0.0286914
## United States 0.0285581
## Germany 0.0275387
## China 0.0265549
## Great Britain 0.0241321
## Iran 0.0240683
## Turkey 0.0179462
## GB
## "Great Britain"
##
##
## Country Similarity
## -------------- -----------
## United States 0.0966120
## Australia 0.0576638
## Canada 0.0443807
## India 0.0436085
## France 0.0363921
## Syria 0.0311878
## Nigeria 0.0257183
## Russia 0.0241321
## Pakistan 0.0238313
## China 0.0230465
## US
## "United States"
##
##
## Country Similarity
## -------------- -----------
## Great Britain 0.0966120
## Canada 0.0370900
## India 0.0345041
## Australia 0.0319711
## Iraq 0.0288215
## Russia 0.0285581
## Iran 0.0271781
## China 0.0256216
## Syria 0.0252300
## Ukraine 0.0249699
## IQ
## "Iraq"
##
##
## Country Similarity
## -------------- -----------
## Syria 0.0832639
## Iran 0.0511883
## United States 0.0288215
## Libya 0.0283447
## Turkey 0.0228091
## Great Britain 0.0205464
## Israel 0.0201511
## Afghanistan 0.0198330
## Pakistan 0.0193370
## Nigeria 0.0152381
## SY
## "Syria"
##
##
## Country Similarity
## -------------- -----------
## Iraq 0.0832639
## Turkey 0.0536160
## Iran 0.0428832
## Great Britain 0.0311878
## Israel 0.0307167
## Russia 0.0286914
## United States 0.0252300
## Libya 0.0250569
## France 0.0208333
## Ukraine 0.0188434
The interest of a country in an event is the number of tweets it published about that event, divided by the maximum number of tweets it published about any single event.
# Rows with at least one tweet, keyed by event.
part.filtered <- part.tidy[tweets > 0]
setkey(part.filtered, component_id)
# Per country: each row's tweets divided by that country's largest tweet count.
part.frac <- part.filtered[
  ,
  list(component_id, tweets, interest = tweets / max(tweets)),
  by = country
]
Most interest is concentrated below \(0.25\).
# Histogram of interest values in bins of width 0.01.
p <- ggplot(part.frac, aes(x = interest)) +
  geom_histogram(binwidth = 0.01) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Interest") +
  ylab("Frequency") +
  scale_y_continuous(labels = comma)
print(p)
ggsave(paste0(PDF_PATH, "interest-hist.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "interest-hist.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
These countries have more than one event with interest equal to 1.0, possibly meaning that they have few tweets:
# Countries with more than one row at interest == 1. which(.N > 1) yields 1 for
# a group holding more than one such row and integer(0) otherwise, so groups
# that fail the test contribute no output row; V1 is therefore always 1.
# The outer parentheses print the result as well as assigning it.
(part.few <- part.frac[interest == 1, which(.N > 1), by=country])
## country V1
## 1: DZ 1
## 2: DM 1
## 3: CV 1
## 4: LC 1
## 5: PW 1
## 6: TJ 1
## 7: SH 1
## 8: SM 1
## 9: AI 1
## 10: BV 1
## 11: CC 1
## 12: TK 1
## 13: KY 1
## 14: HM 1
## 15: CX 1
(part.few2 <- part.filtered[country %in% part.few$country])
## component_id country tweets
## 1: 4 DZ 1
## 2: 7 DZ 1
## 3: 11 DM 1
## 4: 12 CV 1
## 5: 17 DM 1
## ---
## 7751: 25473 DZ 1
## 7752: 25474 DZ 8
## 7753: 25476 DZ 3
## 7754: 25477 TJ 1
## 7755: 25478 DZ 1
quantile(part.few2$tweets, c(.5, .75, .9, .99, .999))
## 50% 75% 90% 99% 99.9%
## 1.000 3.000 6.000 26.000 61.246
Re-filter the table to drop country–event rows with very few tweets, keeping only rows with more than 6 tweets (the 90th percentile found above):
# Rows with more than 6 tweets, keyed by event.
part.filtered <- part.tidy[tweets > 6]
setkey(part.filtered, component_id)
# Per country: each row's tweets divided by that country's largest tweet count.
part.frac <- part.filtered[
  ,
  list(component_id, tweets, interest = tweets / max(tweets)),
  by = country
]
# Rebuild the factor so that only countries still present remain as levels.
part.frac[, country := factor(country)]
# Histogram of the re-filtered interest values in bins of width 0.01.
p <- ggplot(part.frac, aes(x = interest)) +
  geom_histogram(binwidth = 0.01) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  xlab("Interest") +
  ylab("Frequency") +
  scale_y_continuous(labels = comma)
print(p)
ggsave(paste0(PDF_PATH, "interest-hist-f.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "interest-hist-f.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
# Box plot of the interest distribution for a handful of countries.
selected_countries <- c("US", "GB", "UA", "RU", "BR", "CL", "SY", "IQ", "IL", "PS")
p <- ggplot(
  part.frac[country %in% selected_countries],
  aes(x = country, y = interest)
) +
  geom_boxplot() +
  fte_theme() +
  scale_x_discrete(labels = map_country) +
  xlab("") +
  ylab("Interest")
# The title is only added to the on-screen version; the saved files are untitled.
print(p + ggtitle("Distribution of interest of selected countries"))
ggsave(paste0(PDF_PATH, "interest-dist.eps"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
ggsave(paste0(PDF_PATH, "interest-dist.pdf"), p, dpi = pl.DPI, width = pl.W, height = pl.H)
Re-make the plots of Fig. 6.
Each plot is the average interest of each country (x-axis) to the events protagonized by the two selected countries.
# Average interest of every interested country in the events of every
# protagonist country. Rows without a protagonist are filtered out after the
# join, and groups left with an NA are removed by na.omit() at the end.
part.wrt.all <- part.frac %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(
    component_id,
    c.interested = country,
    c.protagonist = country_code,
    interest
  ) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))
part.wrt.all <- data.table(na.omit(part.wrt.all))
# Protagonist pairs to compare; one scatter plot of average interest per pair.
selected_pairs <- list(
  c("RU", "UA"), c("IL", "PS"), c("BR", "DE"),
  c("GB", "US"), c("BR", "US"), c("DE", "IL"),
  c("BR", "PS"), c("MY", "VN"), c("CN", "MY"),
  c("BO", "GA"), c("PY", "TJ"), c("FO", "SJ"),
  c("MY", "UA"), c("AR", "BR"), c("CS", "TM")
)
for (pair in selected_pairs) {
  print(paste(map_country(pair), collapse = " & "))
  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(
    tmp,
    aes(
      x = reorder(c.interested, avg.interest),
      y = avg.interest,
      color = factor(c.protagonist, levels = pair),
      shape = factor(c.protagonist, levels = pair)
    )
  ) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    xlab("Interested countries") +
    ylab("Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(
      legend.justification = c(0, 1),
      legend.position = c(0, 1),
      legend.background = element_rect(fill = "transparent"),
      legend.key.height = unit(1.8, "line"),
      axis.text = element_text(size = 6)
    ) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(
      name = "Protagonist Country",
      breaks = pair,
      labels = map_country(pair)
    ) +
    scale_shape_discrete(
      name = "Protagonist Country",
      breaks = pair,
      solid = FALSE,
      labels = map_country(pair)
    )
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(paste(map_country(pair), collapse = " & ")))
  # File names look like int-prot-<c1>-<c2>.<ext>.
  ggsave(
    paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"), ".eps"),
    p, dpi = pl.DPI, width = pl.W, height = pl.H
  )
  ggsave(
    paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"), ".pdf"),
    p, dpi = pl.DPI, width = pl.W, height = pl.H
  )
}
## [1] "Russia & Ukraine"
## [1] "Israel & Palestine"
## [1] "Brazil & Germany"
## [1] "Great Britain & United States"
## [1] "Brazil & United States"
## [1] "Germany & Israel"
## [1] "Brazil & Palestine"
## [1] "Malaysia & Vietnam"
## [1] "China & Malaysia"
## [1] "Bolivia & Gabon"
## [1] "Paraguay & Tajikistan"
## [1] "Faroe Islands & Svalbard and Jan Mayen"
## [1] "Malaysia & Ukraine"
## [1] "Argentina & Brazil"
## [1] "Serbia & Turkmenistan"
Export vectors to files:
# Write one tab-separated file per protagonist country, containing the rows of
# part.wrt.all for that protagonist (its "interest vector").
# The list of protagonists is derived from the table being exported, as the
# other export blocks in this file do, so it cannot be stale. The loop variable
# is no longer called `c`, which shadowed base::c() inside the loop body.
countries <- unique(part.wrt.all$c.protagonist)
for (prot.code in countries) {
  x <- part.wrt.all[c.protagonist == prot.code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", prot.code, ".txt")
  write.table(x, f, row.names = FALSE, sep = "\t")
}
# Load the pairwise country scores (three unnamed tab-separated columns).
knn.int.cos <- data.table(
  read.table("~/galean/scripts/queries/data/knn_int_cos.txt",
             header = FALSE, sep = "\t",
             stringsAsFactors = TRUE, na.strings = "")
)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# Discard scores of exactly 1 (presumably self-pairs; verify against the file),
# then keep, for every c1, the row(s) with the highest score.
# NOTE(review): the maximum is treated as "nearest", so dist.cos looks like a
# similarity rather than a distance - confirm.
one_nn <- knn.int.cos[dist.cos != 1]
best.rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- one_nn[best.rows]
one_nn <- na.omit(one_nn[order(-dist.cos)])

# Same rows with country names instead of codes, highest score first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(-dist.cos)]
kable(one_nn.table[1:30]) #, format="latex", booktabs=T)
V1 | V2 | dist.cos |
---|---|---|
Solomon Islands | Hungary | 0.9999 |
Solomon Islands | Dominica | 0.9999 |
Solomon Islands | Bulgaria | 0.9999 |
Solomon Islands | Faroe Islands | 0.9999 |
Solomon Islands | Puerto Rico | 0.9999 |
Solomon Islands | Estonia | 0.9999 |
Solomon Islands | Republic of the Congo | 0.9999 |
Solomon Islands | Svalbard and Jan Mayen | 0.9999 |
Solomon Islands | Ivory Coast | 0.9999 |
Solomon Islands | Montenegro | 0.9999 |
Gabon | NA | 0.9977 |
Bolivia | NA | 0.9976 |
Bhutan | Montenegro | 0.9975 |
Bhutan | Maldives | 0.9975 |
Fiji | Syria | 0.9945 |
Gambia | Jordan | 0.9917 |
Togo | Dominica | 0.9892 |
Poland | Afghanistan | 0.9869 |
Vanuatu | Haiti | 0.9825 |
Lesotho | Slovakia | 0.9799 |
Cook Islands | Belize | 0.9773 |
Mongolia | Belize | 0.9773 |
Greenland | NA | 0.9769 |
Cocos [Keeling] Islands | NA | 0.9758 |
Ecuador | Lithuania | 0.9752 |
East Timor | Haiti | 0.9750 |
Burundi | Belgium | 0.9745 |
Somalia | Finland | 0.9704 |
Macedonia | New Zealand | 0.9675 |
Sierra Leone | Nigeria | 0.9673 |
# Country pairs whose score lies strictly between 0 and 0.5, lowest first.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]

# Tile grid labelled with the rounded score.
# The white fill is set as a fixed parameter of geom_tile(). Inside aes() the
# string "white" was mapped as a constant category, which draws the tiles in
# the default discrete hue and adds a spurious legend.
ggplot(knn.int.cos.pl,
       aes(x = reorder(c1, dist.cos),
           y = reorder(c2, dist.cos),
           label = as.character(round(dist.cos, 2)))) +
  geom_tile(size = .1, color = "white", fill = "white") +
  fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")
# Median and upper quantiles of the interest values in part.frac.
quantile(part.frac$interest, probs = c(0.5, 0.75, 0.8, 0.9, 0.99))
## 50% 75% 80% 90% 99%
## 0.008673027 0.026315789 0.034852547 0.070422535 0.319908521
# Rows whose interest is above the overall median interest.
part.median_interest <- part.frac[interest > median(interest)]

# Number of such rows per event; the count column keeps the default name V1.
part.countries_by_event <- part.median_interest[, list(V1 = .N), by = component_id]
Events with maximum number of countries interested (and with interest > median interest):
# Ten events with the largest counts, largest first.
kable(
  part.countries_by_event[order(-V1)][seq_len(10)],
  col.names = c("component_id", "no of countries")
)
component_id | no of countries |
---|---|
15945 | 180 |
14355 | 160 |
18989 | 160 |
13187 | 155 |
2878 | 154 |
24460 | 154 |
12321 | 153 |
298 | 151 |
11432 | 150 |
14577 | 150 |
Even with no filtering (`interest > 0`), the resulting list is unchanged. That is, the events with the most interested countries produced high interest (at least in the top 50%) from every country.
Measure the interest of a country as the percentage of all tweets issued from that country.
# Keep every row with at least one tweet.
part.filtered <- part.tidy[tweets > 0]
setkey(part.filtered, component_id)

# Per country, interest is the row's share of all tweets of that country.
part.frac.all <- part.filtered[, list(component_id,
                                      tweets,
                                      interest = as.numeric(tweets / sum(tweets))),
                               by = country]
Countries with largest IQR of interest:
# Interquartile range of interest per country; show the 30 largest.
kable(
  part.frac.all[, IQR(interest), by = country][order(-V1)][seq_len(30)]
)
country | V1 |
---|---|
MC | 0.0033784 |
NU | 0.0033333 |
GS | 0.0021097 |
VA | 0.0020270 |
MO | 0.0018904 |
IO | 0.0017123 |
SZ | 0.0014286 |
SR | 0.0013280 |
SL | 0.0010718 |
KG | 0.0009416 |
AG | 0.0007955 |
BN | 0.0007794 |
CR | 0.0007452 |
CG | 0.0006998 |
ER | 0.0006840 |
GW | 0.0006566 |
MP | 0.0006532 |
LS | 0.0006184 |
SV | 0.0005797 |
GU | 0.0005376 |
UZ | 0.0005280 |
VI | 0.0005198 |
GA | 0.0004647 |
LR | 0.0004505 |
LI | 0.0004446 |
MN | 0.0004286 |
FJ | 0.0004186 |
LU | 0.0003748 |
MU | 0.0003733 |
MQ | 0.0003546 |
# Countries shown in the box plot.
selected_countries <- c("US", "GB", "UA", "RU", "BR",
                        "CL", "SY", "IQ", "IL", "PS")

# One box per selected country summarising its interest values in part.frac.all.
p <- ggplot(part.frac.all[country %in% selected_countries],
            aes(x = country, y = interest)) +
  geom_boxplot() +
  fte_theme() +
  scale_x_discrete(labels = map_country) +
  labs(x = "", y = "Interest")

# The title is only added to the on-screen version, not to the saved files.
print(p + ggtitle("Distribution of interest of selected countries"))
for (fig.file in c("interest-all-dist.eps", "interest-all-dist.pdf")) {
  ggsave(paste0(PDF_PATH, fig.file), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}
# Attach the protagonist countries of every event to part.frac.all and average
# the interest for each (interested country, protagonist country) pair.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

# Drop rows with missing values and return to a data.table.
part.wrt.all <- data.table(na.omit(part.wrt.all))
# Pairs of protagonist countries to compare, one figure per pair.
selected_pairs <- list(
  c("RU", "UA"), c("IL", "PS"), c("BR", "DE"),
  c("GB", "US"), c("BR", "US"), c("DE", "IL"),
  c("BR", "PS"), c("MY", "VN"), c("CN", "MY"),
  c("BO", "GA"), c("PY", "TJ"), c("FO", "SJ"),
  c("MY", "UA"), c("AR", "BR"), c("CS", "TM")
)

# For every pair: scatter the average interest of each interested country
# (x-axis, ordered by interest) in events protagonized by either country of
# the pair, then save the untitled figure as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, avg.interest),
                  y = avg.interest,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1),
          legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6)) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(pair.title))

  # File name is "int-prot-all-<code1>-<code2>" plus the extension.
  fig.stem <- paste0(PDF_PATH, paste(c("int-prot-all", pair), collapse = "-"))
  ggsave(paste0(fig.stem, ".eps"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste0(fig.stem, ".pdf"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}
## [1] "Russia & Ukraine"
## [1] "Israel & Palestine"
## [1] "Brazil & Germany"
## [1] "Great Britain & United States"
## [1] "Brazil & United States"
## [1] "Germany & Israel"
## [1] "Brazil & Palestine"
## [1] "Malaysia & Vietnam"
## [1] "China & Malaysia"
## [1] "Bolivia & Gabon"
## [1] "Paraguay & Tajikistan"
## [1] "Faroe Islands & Svalbard and Jan Mayen"
## [1] "Malaysia & Ukraine"
## [1] "Argentina & Brazil"
## [1] "Serbia & Turkmenistan"
Export vectors to files:
# Write one tab-separated file per protagonist country, containing the rows of
# part.wrt.all for that protagonist.
countries <- unique(part.wrt.all$c.protagonist)
for (prot.code in countries) {
  x <- part.wrt.all[c.protagonist == prot.code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", prot.code, ".txt")
  write.table(x, f, row.names = FALSE, sep = "\t")
}
# Load the pairwise country scores (three unnamed tab-separated columns).
knn.int.cos <- data.table(
  read.table("~/galean/scripts/queries/data/knn_int_cos2.txt",
             header = FALSE, sep = "\t",
             stringsAsFactors = TRUE, na.strings = "")
)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# Discard scores of exactly 1 (presumably self-pairs; verify against the file),
# then keep, for every c1, the row(s) with the highest score.
one_nn <- knn.int.cos[dist.cos != 1]
best.rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- one_nn[best.rows]
one_nn <- na.omit(one_nn[order(-dist.cos)])

# Same rows with country names instead of codes, highest score first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(-dist.cos)]
kable(one_nn.table[1:30]) #, format="latex", booktabs=T)
V1 | V2 | dist.cos |
---|---|---|
Cuba | United States | 0.9995 |
United States | Cuba | 0.9995 |
Costa Rica | Uruguay | 0.9994 |
Uruguay | Costa Rica | 0.9994 |
Argentina | Brazil | 0.9993 |
Brazil | Argentina | 0.9993 |
Bosnia and Herzegovina | Honduras | 0.9992 |
Honduras | Bosnia and Herzegovina | 0.9992 |
Honduras | Croatia | 0.9992 |
Croatia | Costa Rica | 0.9992 |
Croatia | Honduras | 0.9992 |
Denmark | Russia | 0.9991 |
Spain | Argentina | 0.9991 |
Iraq | North Korea | 0.9991 |
North Korea | Iraq | 0.9991 |
Russia | Denmark | 0.9991 |
Libya | Cuba | 0.9990 |
Mexico | Peru | 0.9990 |
Peru | Mexico | 0.9990 |
Taiwan | Denmark | 0.9989 |
Vietnam | Denmark | 0.9989 |
Botswana | United States | 0.9988 |
Israel | Iraq | 0.9988 |
Aruba | Libya | 0.9987 |
France | Italy | 0.9987 |
Guyana | Papua | 0.9987 |
Italy | France | 0.9987 |
Papua | Guyana | 0.9987 |
Democratic Republic of the Congo | Colombia | 0.9986 |
Colombia | Democratic Republic of the Congo | 0.9986 |
# Tile grid of the first 30 nearest-neighbour rows; tiles are shaded by score
# and labelled with the score rounded to two decimals.
ggplot(one_nn[seq_len(30)],
       aes(x = reorder(c1, -dist.cos),
           y = reorder(c2, -dist.cos),
           fill = dist.cos,
           label = as.character(round(dist.cos, 2)))) +
  geom_tile(size = .1) +
  fte_theme() +
  geom_text(size = 2, color = palette[1]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")
# Rebuild the average interest of each (interested country, protagonist
# country) pair from part.frac.all and the protagonist table.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(avg.interest = mean(interest))

# Drop rows with missing values and return to a data.table.
part.wrt.all <- data.table(na.omit(part.wrt.all))
# Pairs of protagonist countries to compare, one figure per pair.
selected_pairs <- list(
  c("FO", "SJ"), c("MY", "UA"), c("FJ", "VU"),
  c("RS", "TM"), c("CN", "DE"), c("PY", "TJ"),
  c("NL", "MY"), c("BO", "TL"), c("GA", "TL"),
  c("AU", "DE"), c("PL", "TR"), c("IN", "AU"),
  c("AT", "IQ"), c("KE", "YE"), c("AM", "VA")
)

# For every pair: scatter the average interest of each interested country
# (x-axis, ordered by interest) in events protagonized by either country of
# the pair, then save the untitled figure as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, avg.interest),
                  y = avg.interest,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1),
          legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6)) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(pair.title))

  # File name is "int-prot-all-<code1>-<code2>" plus the extension.
  fig.stem <- paste0(PDF_PATH, paste(c("int-prot-all", pair), collapse = "-"))
  ggsave(paste0(fig.stem, ".eps"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste0(fig.stem, ".pdf"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}
## [1] "Faroe Islands & Svalbard and Jan Mayen"
## [1] "Malaysia & Ukraine"
## [1] "Fiji & Vanuatu"
## [1] "Serbia & Turkmenistan"
## [1] "China & Germany"
## [1] "Paraguay & Tajikistan"
## [1] "Netherlands & Malaysia"
## [1] "Bolivia & East Timor"
## [1] "Gabon & East Timor"
## [1] "Australia & Germany"
## [1] "Poland & Turkey"
## [1] "India & Australia"
## [1] "Austria & Iraq"
## [1] "Kenya & Yemen"
## [1] "Armenia & Vatican City"
# Five-number summary and mean of the interest values in part.frac.all.
summary(part.frac.all$interest)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000001 0.0000181 0.0000542 0.0002377 0.0001670 0.5000000
# Median and upper quantiles of the interest values in part.frac.all.
quantile(part.frac.all$interest, probs = c(0.5, 0.75, 0.8, 0.9, 0.99))
## 50% 75% 80% 90% 99%
## 5.424464e-05 1.669728e-04 2.135535e-04 4.446421e-04 2.701790e-03
# Keep only rows whose interest exceeds the 99th percentile of all interest
# values. The cut-off is computed from the data rather than pasted from the
# printed quantile output, so it stays correct if the input changes.
# The object keeps its existing name although the threshold is not the median.
interest.p99 <- quantile(part.frac.all$interest, probs = 0.99, names = FALSE)
part.median_interest <- part.frac.all[interest > interest.p99]

# Number of remaining rows per event (count column is named V1 by default).
part.countries_by_event <- part.median_interest[, max(.N), by = component_id]
Events with the largest number of interested countries (counting only interest above the 99th percentile):
# Ten events with the largest counts, largest first.
kable(
  part.countries_by_event[order(-V1)][seq_len(10)],
  col.names = c("component_id", "no of countries")
)
component_id | no of countries |
---|---|
15945 | 202 |
298 | 153 |
14355 | 144 |
12321 | 143 |
18989 | 130 |
2878 | 127 |
13187 | 106 |
17406 | 102 |
19427 | 97 |
24460 | 97 |
# Hand-written descriptions, dates and country counts of high-impact events,
# assembled into a LaTeX table. The three vectors are parallel: element i of
# each one describes the same event.
events <- c(
  "Death of actor Robin Williams.",
  "2014 FIFA World Cup final between Germany and Argentina.",
  "2014 FIFA World Cup starts.",
  "2015 Super Bowl starts.",
  "New Year's Eve 2013",
  "Soccer Player Luis Suarez is suspended from 2014 World Cup.",
  "Charlie Hebdo shooting in Paris.",
  "2015 Grammy Awards.",
  "Professional boxing match between Floyd Mayweather and Manny Pacquiao."
)
dates <- c(
  "2014-08-12", "2014-07-13", "2014-06-12",
  "2015-02-02", "2013-12-31", "2014-06-26",
  "2015-01-07", "2015-02-09", "2015-05-03"
)
countries_affected <- c(202, 144, 143, 130, 127, 106, 102, 97, 97)

hi_events <- data.table(Description = events,
                        Date = dates,
                        Countries = countries_affected)
kable(hi_events, format = "latex", booktabs = TRUE)
\begin{tabular}{llr}
\toprule
Description & Date & Countries\\
\midrule
Death of actor Robin Williams. & 2014-08-12 & 202\\
2014 FIFA World Cup final between Germany and Argentina. & 2014-07-13 & 144\\
2014 FIFA World Cup starts. & 2014-06-12 & 143\\
2015 Super Bowl starts. & 2015-02-02 & 130\\
New Year's Eve 2013 & 2013-12-31 & 127\\
\addlinespace
Soccer Player Luis Suarez is suspended from 2014 World Cup. & 2014-06-26 & 106\\
Charlie Hebdo shooting in Paris. & 2015-01-07 & 102\\
2015 Grammy Awards. & 2015-02-09 & 97\\
Professional boxing match between Floyd Mayweather and Manny Pacquiao. & 2015-05-03 & 97\\
\bottomrule
\end{tabular}
# Row-level table (no aggregation): one row per event, interested country and
# protagonist country, carrying the interest value from part.frac.all.
part.wrt.all <- part.frac.all %>%
  full_join(prot, by = "component_id") %>%
  filter(!is.na(country_code)) %>%
  select(component_id,
         c.interested = country,
         c.protagonist = country_code,
         interest)

# Convert to a data.table and drop rows with missing values.
part.wrt.all <- na.omit(data.table(part.wrt.all))
# Self-interest of every listed country: the summed interest over the rows in
# which the country is both the protagonist and the interested party (0 when
# there is no such row).
# vapply() replaces the loop that grew `cs` and `si` with c() on every
# iteration; it also yields correctly typed empty vectors for an empty list.
cs <- as.character(countries_list$country.code)
si <- vapply(
  cs,
  function(cnt) {
    part.wrt.all[c.protagonist == cnt & c.interested == cnt][, sum(interest)]
  },
  numeric(1),
  USE.NAMES = FALSE
)
part.self <- data.table(c.interested = cs, self.interest = si)
# part.self <- part.self[order(-self.interest)][, list(country=map_country(country),
# self.interest)]

# Add N, the number of part.wrt.all rows per interested country, to the
# self-interest table and sort by decreasing self-interest.
part.self <- left_join(part.wrt.all[, .N, by = c.interested],
                       part.self,
                       "c.interested")
part.self <- part.self[order(-self.interest)]
# Horizontal bar chart of the first 15 rows of part.self; each bar is
# annotated with the row count N of that country.
p <- ggplot(part.self[seq_len(15)],
            aes(x = reorder(c.interested, self.interest),
                y = self.interest,
                label = comma(N))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(size = 2.5, hjust = -.2, color = palette[6]) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, 1, by = 0.1),
                     limits = c(0, 1)) +
  scale_x_discrete(labels = map_country) +
  fte_theme() +
  geom_hline(yintercept = 0, size = 0.4, color = "black") +
  labs(x = "Country", y = "Interest")

# The title is only added to the on-screen version, not to the saved files.
print(p + ggtitle("Interest of countries in events protagonized by them"))
for (fig.file in c("self-interest.eps", "self-interest.pdf")) {
  ggsave(paste0(PDF_PATH, fig.file), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}
# Rows with at least one tweet.
part.tidy2 <- part.tidy[tweets > 0]

# For every (interested country, protagonist country) pair, count the joined
# rows; the count is stored in the column "events".
part.wrt.all <- part.tidy2 %>%
  full_join(prot, by = "component_id") %>%
  select(component_id,
         c.protagonist = country_code,
         c.interested = country,
         tweets) %>%
  group_by(c.interested, c.protagonist) %>%
  summarise(events = length(component_id))

# Drop incomplete rows, convert to a data.table and show the result.
part.wrt.all <- data.table(na.omit(part.wrt.all))
print(part.wrt.all)
## c.interested c.protagonist events
## 1: AD AE 4
## 2: AD AF 5
## 3: AD AR 4
## 4: AD AU 22
## 5: AD BB 1
## ---
## 31679: ZW XK 1
## 31680: ZW YE 68
## 31681: ZW ZA 124
## 31682: ZW ZM 1
## 31683: ZW ZW 27
# part.wrt.all <- part.wrt.all[, .SD[sum(events) > 50],
# by=c.interested][, .SD[sum(events) > 50],
# by=c.protagonist]

# Standardise the event counts within each interested country (centre on the
# group mean, divide by the group standard deviation) and store the result
# in place as sc.events.
# NOTE(review): groups with a single row presumably get a non-finite value,
# which would explain the "Removed n rows" plot warnings - confirm.
part.wrt.all[
  ,
  sc.events := scale(events),
  by = c.interested
]
# tmp2[, sc.events:=scale(events), by=c.interested]
# Pairs of protagonist countries to compare, one figure per pair.
selected_pairs <- list(
  c("UA", "RU"), c("IL", "PS"), c("BR", "DE"),
  c("US", "GB")
)

# For every pair: scatter the standardised event count (sc.events) of each
# interested country for events protagonized by either country of the pair,
# add one smoothed trend per protagonist, then save as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist)) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1),
          legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6)) +
    # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(pair.title))

  # File name is "int-prot-<code1>-<code2>" plus the extension.
  fig.stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  ggsave(paste0(fig.stem, ".eps"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste0(fig.stem, ".pdf"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}
## [1] "Ukraine & Russia"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "Israel & Palestine"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Brazil & Germany"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "United States & Great Britain"
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
# Write one tab-separated file per protagonist country, containing the rows of
# part.wrt.all for that protagonist.
countries <- unique(part.wrt.all$c.protagonist)
for (prot.code in countries) {
  x <- part.wrt.all[c.protagonist == prot.code]
  f <- paste0("~/galean/scripts/queries/data/interest_vectors/", prot.code, ".txt")
  write.table(x, f, row.names = FALSE, sep = "\t")
}
# Load the pairwise country scores (three unnamed tab-separated columns).
knn.int.cos <- data.table(
  read.table("~/galean/scripts/queries/data/knn_int_cos_scale.txt",
             header = FALSE, sep = "\t",
             stringsAsFactors = TRUE, na.strings = "")
)
setnames(knn.int.cos, c("V1", "V2", "V3"), c("c1", "c2", "dist.cos"))

# Discard scores of exactly 1 (presumably self-pairs; verify against the file),
# then keep, for every c1, the row(s) with the highest score.
one_nn <- knn.int.cos[dist.cos != 1]
best.rows <- one_nn[, .I[dist.cos == max(dist.cos)], by = c1]$V1
one_nn <- one_nn[best.rows]
one_nn <- na.omit(one_nn[order(-dist.cos)])

# Same rows with country names instead of codes, highest score first.
one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.cos)][order(-dist.cos)]
kable(one_nn.table[1:30]) #, format="latex", booktabs=T)
V1 | V2 | dist.cos |
---|---|---|
Australia | Great Britain | 0.9828 |
Great Britain | Australia | 0.9828 |
United States | Great Britain | 0.9803 |
Algeria | Sri Lanka | 0.9766 |
Sri Lanka | Algeria | 0.9766 |
Russia | Ukraine | 0.9728 |
Ukraine | Russia | 0.9728 |
Canada | United States | 0.9721 |
Bosnia and Herzegovina | Honduras | 0.9714 |
Honduras | Bosnia and Herzegovina | 0.9714 |
Croatia | Bosnia and Herzegovina | 0.9713 |
France | Great Britain | 0.9708 |
Taiwan | Bosnia and Herzegovina | 0.9705 |
Niger | Bosnia and Herzegovina | 0.9686 |
Finland | Croatia | 0.9684 |
Costa Rica | Sweden | 0.9671 |
Sweden | Costa Rica | 0.9671 |
Estonia | Kazakhstan | 0.9667 |
Kazakhstan | Estonia | 0.9667 |
Denmark | Croatia | 0.9665 |
Uruguay | Taiwan | 0.9660 |
Zimbabwe | Croatia | 0.9650 |
Ivory Coast | Croatia | 0.9638 |
Ghana | Taiwan | 0.9623 |
China | Australia | 0.9608 |
Singapore | Taiwan | 0.9606 |
Chile | Taiwan | 0.9602 |
Cameroon | Niger | 0.9564 |
Norway | Croatia | 0.9563 |
Guinea | Mali | 0.9561 |
# Scores among a hand-picked set of countries, highest first, excluding
# scores of exactly 1.
# NOTE(review): the c1 set contains "IL" where the c2 set contains "PS" -
# confirm the asymmetry is intended.
knn.int.cos[
  c1 %in% c("BR", "DE", "UA", "RU", "IL") &
    c2 %in% c("BR", "DE", "UA", "RU", "PS")
][dist.cos != 1][order(-dist.cos)]
## c1 c2 dist.cos
## 1: RU UA 0.9728
## 2: UA RU 0.9728
## 3: DE RU 0.9256
## 4: RU DE 0.9256
## 5: IL RU 0.8964
## 6: IL UA 0.8879
## 7: DE UA 0.8733
## 8: UA DE 0.8733
## 9: IL PS 0.8650
## 10: IL DE 0.8543
## 11: BR DE 0.7663
## 12: DE BR 0.7663
## 13: UA PS 0.7189
## 14: RU PS 0.6948
## 15: DE PS 0.6401
## 16: BR RU 0.6239
## 17: RU BR 0.6239
## 18: IL BR 0.5985
## 19: BR UA 0.5605
## 20: UA BR 0.5605
## 21: BR PS 0.3527
## c1 c2 dist.cos
# Country pairs whose score lies strictly between 0.95 and 1, highest first.
knn.int.cos.pl <- knn.int.cos[order(-dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos > .95 & dist.cos < 1]

# Tile grid labelled with the rounded score.
# The white fill is set as a fixed parameter of geom_tile(). Inside aes() the
# string "white" was mapped as a constant category, which draws the tiles in
# the default discrete hue and adds a spurious legend.
ggplot(knn.int.cos.pl,
       aes(x = reorder(c1, -dist.cos),
           y = reorder(c2, -dist.cos),
           label = as.character(round(dist.cos, 2)))) +
  geom_tile(size = .1, color = "white", fill = "white") +
  fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")
# Pairs of protagonist countries to compare, one figure per pair.
selected_pairs <- list(
  c("AU", "GB"), c("US", "GB"), c("DZ", "LK"),
  c("RU", "UA"), c("CA", "US")
)

# For every pair: scatter the standardised event count (sc.events) of each
# interested country for events protagonized by either country of the pair,
# add one thin smoothed trend per protagonist, then save as EPS and PDF.
# NOTE(review): the "int-prot" file prefix is also used by other loops in this
# file, so figures for pairs that recur (same codes, same order) overwrite
# each other - confirm this is acceptable.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist), size = .5) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Interest") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1),
          legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6)) +
    # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(pair.title))

  # File name is "int-prot-<code1>-<code2>" plus the extension.
  fig.stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  ggsave(paste0(fig.stem, ".eps"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste0(fig.stem, ".pdf"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
}
## [1] "Australia & Great Britain"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "United States & Great Britain"
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## [1] "Algeria & Sri Lanka"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Russia & Ukraine"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "Canada & United States"
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# Load the pairwise Euclidean scores between countries (three unnamed,
# tab-separated columns) and name the columns c1, c2 and dist.euc.
knn.int.euc <- data.table(read.table('~/galean/scripts/queries/data/knn_int_euc_scale.txt',
header=F, sep='\t',
stringsAsFactors = T, na.strings = ''))
setnames(knn.int.euc, c("V1", "V2", "V3"), c("c1", "c2", "dist.euc"))
# Drop zero distances (presumably self-pairs; verify against the file), then
# keep for every c1 the row(s) with the smallest distance, sorted ascending.
one_nn <- knn.int.euc[dist.euc != 0]
one_nn <- one_nn[one_nn[, .I[dist.euc == min(dist.euc)], by=c1]$V1]
one_nn <- na.omit(one_nn[order(dist.euc)])
# Attach the per-country columns of prot.by_country to both sides of each pair.
# The join key is always called country_code, so c1 and then c2 are renamed to
# it in turn. Note that the first setnames() renames one_nn itself in place.
# NOTE(review): prot.by_country presumably holds one row per country_code with
# a count column N (the second join yields N.x / N.y) - confirm.
setnames(one_nn, c("c1"), c("country_code"))
tmp <- one_nn %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code", "c2"), c("c1", "country_code"))
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("c2"))
# Replace the codes with country names; N.x belongs to c1 and N.y to c2.
tmp <- tmp[, list(c1 = map_country(c1), c2 = map_country(c2), N.x, N.y, dist.euc)]
# tmp is left ordered by distance divided by the product of the two counts ...
tmp <- tmp[order(dist.euc / (N.x * N.y))]
# ... but the printed table is re-sorted by plain distance and restricted to
# pairs where both counts reach 166.41.
# NOTE(review): the origin of the 166.41 threshold is not visible here - TODO document.
kable(tmp[N.x >= 166.41 & N.y >= 166.41][order(dist.euc)])#, format="latex", booktabs=T)
c1 | c2 | N.x | N.y | dist.euc |
---|---|---|---|---|
Turkey | Indonesia | 198 | 172 | 1.1442 |
Yemen | Turkey | 202 | 198 | 1.3416 |
Afghanistan | Turkey | 323 | 198 | 1.5304 |
Libya | Turkey | 253 | 198 | 1.6050 |
Palestine | Egypt | 360 | 316 | 1.6496 |
Egypt | Palestine | 316 | 360 | 1.6496 |
Malaysia | Turkey | 262 | 198 | 1.8096 |
Japan | Spain | 354 | 258 | 1.8327 |
Spain | Japan | 258 | 354 | 1.8327 |
Italy | Japan | 315 | 354 | 1.9018 |
Brazil | Spain | 236 | 258 | 1.9060 |
Pakistan | Germany | 453 | 371 | 2.0674 |
Germany | Pakistan | 371 | 453 | 2.0674 |
Syria | Israel | 647 | 561 | 2.4463 |
Israel | Syria | 561 | 647 | 2.4463 |
Ukraine | Russia | 921 | 823 | 2.5557 |
Russia | Ukraine | 823 | 921 | 2.5557 |
Nigeria | Pakistan | 412 | 453 | 2.5822 |
China | Canada | 646 | 715 | 2.6025 |
Canada | China | 715 | 646 | 2.6025 |
Iran | Syria | 496 | 647 | 2.6838 |
Iraq | Iran | 654 | 496 | 2.9270 |
France | Canada | 627 | 715 | 3.7859 |
Australia | France | 974 | 627 | 4.1398 |
India | Australia | 1561 | 974 | 4.8339 |
Great Britain | India | 4015 | 1561 | 41.7719 |
United States | Great Britain | 10162 | 4015 | 97.2733 |
# one_nn.table <- one_nn[, list(map_country(c1), map_country(c2), dist.euc)]
# one_nn.table <- one_nn.table[order(dist.euc)]
#
# kable(one_nn.table[1:30]) #, format="latex", booktabs=T)
# Distances among a hand-picked set of countries, smallest first, excluding
# zero distances.
# NOTE(review): the c1 set contains "IL" where the c2 set contains "PS" -
# confirm the asymmetry is intended.
knn.int.euc[
  c1 %in% c("BR", "DE", "UA", "RU", "IL") &
    c2 %in% c("BR", "DE", "UA", "RU", "PS")
][dist.euc != 0][order(dist.euc)]
## c1 c2 dist.euc
## 1: RU UA 2.5557
## 2: UA RU 2.5557
## 3: BR DE 2.8704
## 4: DE BR 2.8704
## 5: BR PS 3.2902
## 6: DE PS 3.3507
## 7: IL DE 4.1893
## 8: IL RU 4.2992
## 9: IL PS 4.8173
## 10: IL UA 5.2256
## 11: DE RU 5.5783
## 12: RU DE 5.5783
## 13: IL BR 6.0385
## 14: DE UA 6.9912
## 15: UA DE 6.9912
## 16: RU PS 7.5344
## 17: BR RU 7.9699
## 18: RU BR 7.9699
## 19: UA PS 8.5056
## 20: BR UA 9.2345
## 21: UA BR 9.2345
## c1 c2 dist.euc
# Country pairs whose score lies strictly between 0 and 0.5, lowest first.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]

# Tile grid labelled with the rounded score.
# The white fill is set as a fixed parameter of geom_tile(). Inside aes() the
# string "white" was mapped as a constant category, which draws the tiles in
# the default discrete hue and adds a spurious legend.
ggplot(knn.int.cos.pl,
       aes(x = reorder(c1, dist.cos),
           y = reorder(c2, dist.cos),
           label = as.character(round(dist.cos, 2)))) +
  geom_tile(size = .1, color = "white", fill = "white") +
  fte_theme() +
  geom_text(size = 2, color = palette[9]) +
  scale_x_discrete(labels = map_country) +
  scale_y_discrete(labels = map_country) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 7)) +
  labs(x = "Country", y = "Country")
# Pairs of protagonist countries to compare, one figure per pair.
selected_pairs <- list(
  c("AF", "TR"), c("PK", "IR"), c("EG", "AF"),
  c("JP", "IR"), c("MY", "PK"), c("BR", "DE"),
  c("US", "GB"), c("IL", "PS"), c("UA", "RU"),
  c("KP", "KR"), c("ID", "TR"), c("YE", "TR")
)

# For every pair: scatter the standardised event count (sc.events) of each
# interested country for events protagonized by either country of the pair,
# add one thin smoothed trend per protagonist, clip the y-axis to [-0.5, 2],
# then save as EPS and PDF.
for (pair in selected_pairs) {
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]
  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair))) +
    geom_point(size = 1, position = "jitter", alpha = 1) +
    geom_smooth(aes(group = c.protagonist), size = .5) +
    # geom_line(aes(group=c.protagonist)) +
    # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
    labs(x = "Interested countries", y = "Standard deviations from the mean") +
    fte_theme() +
    theme(axis.text.x = element_blank()) +
    theme(legend.justification = c(0, 1),
          legend.position = c(0, 1),
          legend.background = element_rect(fill = "transparent"),
          legend.key.height = unit(1.8, "line"),
          axis.text = element_text(size = 6)) +
    scale_y_continuous(breaks = seq(-.5, 2, by = 0.25), limits = c(-.5, 2)) +
    scale_color_discrete(name = "Protagonist Country",
                         breaks = pair,
                         labels = pair.names) +
    scale_shape_discrete(name = "Protagonist Country",
                         breaks = pair,
                         solid = FALSE,
                         labels = pair.names)
  # scale_x_discrete(labels=map_country)
  print(p + ggtitle(pair.title))

  # File name is "int-prot-<code1>-<code2>" plus the extension.
  # NOTE(review): the PDF is saved 2 units narrower than the EPS (pl.W - 2) -
  # confirm the difference is intentional.
  fig.stem <- paste0(PDF_PATH, paste(c("int-prot", pair), collapse = "-"))
  ggsave(paste0(fig.stem, ".eps"), p,
         dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste0(fig.stem, ".pdf"), p,
         dpi = pl.DPI, width = pl.W - 2, height = pl.H)
}
## [1] "Afghanistan & Turkey"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Pakistan & Iran"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Egypt & Afghanistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Japan & Iran"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Malaysia & Pakistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Brazil & Germany"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "United States & Great Britain"
## Warning: Removed 446 rows containing non-finite values (stat_smooth).
## Warning: Removed 447 rows containing missing values (geom_point).
## Warning: Removed 446 rows containing non-finite values (stat_smooth).
## Warning: Removed 446 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 446 rows containing non-finite values (stat_smooth).
## Warning: Removed 446 rows containing missing values (geom_point).
## [1] "Israel & Palestine"
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## [1] "Ukraine & Russia"
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
## [1] "North Korea & South Korea"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Indonesia & Turkey"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Yemen & Turkey"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
# Nearest-neighbour table built from the distances in knn_int_abs_scale.txt.
# The file is read as a headerless, tab-separated table; with na.strings = ''
# only empty fields become missing, so a literal "NA" code stays a value.
knn.int.abs <- data.table(read.table('~/galean/scripts/queries/data/knn_int_abs_scale.txt',
header=F, sep='\t',
stringsAsFactors = T, na.strings = ''))
# Columns: first country, second country, distance between them.
setnames(knn.int.abs, c("V1", "V2", "V3"), c("c1", "c2", "dist.abs"))
# Drop zero distances (presumably a country paired with itself -- TODO confirm).
one_nn <- knn.int.abs[dist.abs != 0]
# For every c1 keep the row(s) with the smallest dist.abs; .I yields the row
# indices, which data.table returns in the default column V1.
one_nn <- one_nn[one_nn[, .I[dist.abs == min(dist.abs)], by=c1]$V1]
# Sort by distance and remove rows containing missing values.
one_nn <- na.omit(one_nn[order(dist.abs)])
# prot.by_country is joined twice on "country_code": first for c1, then for
# c2. The columns are renamed in place around each join so that the key name
# matches. Presumably prot.by_country carries a count column N, which the
# second join suffixes into N.x (c1) and N.y (c2) -- verify against its
# definition.
setnames(one_nn, c("c1"), c("country_code"))
tmp <- one_nn %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code", "c2"), c("c1", "country_code"))
tmp <- tmp %>% left_join(prot.by_country, "country_code")
setnames(tmp, c("country_code"), c("c2"))
# Replace country codes with names via map_country and keep the table columns.
tmp <- tmp[, list(c1 = map_country(c1), c2 = map_country(c2), N.x, N.y, dist.abs)]
# Order by distance relative to the product of the two counts.
# NOTE(review): the kable() call below re-sorts by dist.abs, so this ordering
# appears to matter only for rows with equal dist.abs -- confirm it is needed.
tmp <- tmp[order(dist.abs / (N.x * N.y))]
# Show only pairs whose combined count is at least 500, nearest first.
kable(tmp[N.x + N.y >= 500][order(dist.abs)])
c1 | c2 | N.x | N.y | dist.abs |
---|---|---|---|---|
Libya | Afghanistan | 253 | 323 | 12.8633 |
Afghanistan | Libya | 323 | 253 | 12.8633 |
Palestine | Egypt | 360 | 316 | 14.5588 |
Egypt | Palestine | 316 | 360 | 14.5588 |
Japan | Italy | 354 | 315 | 14.8907 |
Italy | Japan | 315 | 354 | 14.8907 |
Malaysia | Afghanistan | 262 | 323 | 16.0362 |
Spain | Italy | 258 | 315 | 16.3063 |
Pakistan | Germany | 453 | 371 | 19.6107 |
Germany | Pakistan | 371 | 453 | 19.6107 |
Syria | Israel | 647 | 561 | 21.4050 |
Israel | Syria | 561 | 647 | 21.4050 |
Iran | Israel | 496 | 561 | 22.6966 |
Iraq | Syria | 654 | 647 | 23.1034 |
Ukraine | Russia | 921 | 823 | 23.4717 |
Russia | Ukraine | 823 | 921 | 23.4717 |
Nigeria | Pakistan | 412 | 453 | 23.7334 |
China | Canada | 646 | 715 | 25.6509 |
Canada | China | 715 | 646 | 25.6509 |
France | Canada | 627 | 715 | 38.3235 |
India | Australia | 1561 | 974 | 41.0575 |
Australia | India | 974 | 1561 | 41.0575 |
Great Britain | India | 4015 | 1561 | 553.0469 |
United States | Great Britain | 10162 | 4015 | 1314.0598 |
# knn.int.abs[c1 %in% c("BR", "DE", "UA", "RU", "IL") & c2 %in% c("BR", "DE", "UA", "RU", "PS")][dist.abs != 0][order(dist.abs)]
# Tile grid of country pairs whose cosine distance lies strictly between
# 0 and 0.5, each tile labelled with the distance rounded to two decimals.
# Axis levels are ordered via reorder() on dist.cos and relabelled through
# map_country.
knn.int.cos.pl <- knn.int.cos[order(dist.cos)]
knn.int.cos.pl <- knn.int.cos.pl[dist.cos < .5 & dist.cos > 0]
ggplot(knn.int.cos.pl, aes(x=reorder(c1, dist.cos), y=reorder(c2, dist.cos),
                           label=as.character(round(dist.cos, 2)))) +
  # fill is a constant, so it is set on the layer rather than inside aes():
  # inside aes() the string "white" is treated as a one-level data value and
  # the tiles are drawn in the default discrete scale colour, not in white.
  geom_tile(size=.1, fill="white", color="white") + fte_theme() +
  geom_text(size=2, color=palette[9]) +
  scale_x_discrete(labels=map_country) + scale_y_discrete(labels=map_country) +
  theme(axis.text.x=element_text(size=7, angle=45, hjust=1),
        axis.text.y=element_text(size=7)) +
  labs(x="Country", y="Country")
# Pairs of protagonist countries to compare. For every pair the loop prints
# the pair title, draws sc.events per interested country (jittered points
# plus one smoother per protagonist), shows the plot with a title, and saves
# the untitled plot as EPS and PDF under PDF_PATH.
selected_pairs <- list(
  c("IT", "ES"), c("NG", "IT"), c("EG", "AF"),
  c("JP", "IR"), c("MY", "PK")
)
for (pair in selected_pairs) {
  # Human-readable names are reused for the console line, legends and title.
  pair.names <- map_country(pair)
  pair.title <- paste(pair.names, collapse = " & ")
  print(pair.title)

  tmp <- part.wrt.all[c.protagonist %in% pair]

  p <- ggplot(tmp,
              aes(x = reorder(c.interested, sc.events),
                  y = sc.events,
                  color = factor(c.protagonist, levels = pair),
                  shape = factor(c.protagonist, levels = pair)))
  p <- p + geom_point(size = 1, position = "jitter", alpha = 1)
  p <- p + geom_smooth(aes(group = c.protagonist))
  # Alternatives kept from earlier iterations:
  # geom_line(aes(group=c.protagonist)) +
  # geom_smooth(aes(x=reorder(c.interested, avg.interest), y=avg.interest, group=c.protagonist), size=.5) +
  p <- p + xlab("Interested countries")
  p <- p + ylab("Interest")
  p <- p + fte_theme()
  p <- p + theme(axis.text.x = element_blank())
  p <- p + theme(legend.justification = c(0, 1),
                 legend.position = c(0, 1),
                 legend.background = element_rect(fill = "transparent"),
                 legend.key.height = unit(1.8, "line"),
                 axis.text = element_text(size = 6))
  # The y axis is left on its default scale here:
  # scale_y_continuous(breaks=seq(0, 1, by=0.1), limits=c(0, 1.1)) +
  p <- p + scale_color_discrete(name = "Protagonist Country",
                                breaks = pair,
                                labels = pair.names)
  p <- p + scale_shape_discrete(name = "Protagonist Country",
                                breaks = pair,
                                solid = FALSE,
                                labels = pair.names)
  # scale_x_discrete(labels=map_country)

  # The title is only added to the on-screen copy; saved files have none.
  print(p + ggtitle(pair.title))

  # File names look like <PDF_PATH>int-prot-<code1>-<code2>.<ext>
  file.stem <- paste(c("int-prot", pair), collapse = "-")
  # NOTE(review): the semi-transparency warnings in the rendered output
  # presumably come from this EPS export -- verify the smoother band survives.
  ggsave(paste0(c(PDF_PATH, file.stem, ".eps"), collapse = ""),
         plot = p, dpi = pl.DPI, width = pl.W, height = pl.H)
  ggsave(paste0(c(PDF_PATH, file.stem, ".pdf"), collapse = ""),
         plot = p, dpi = pl.DPI, width = pl.W, height = pl.H)
}
## [1] "Italy & Spain"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Nigeria & Italy"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Egypt & Afghanistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Japan & Iran"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
## [1] "Malaysia & Pakistan"
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
# Print the full code -> name lookup table, sorted by country code.
# Namibia's ISO code is the literal string "NA", which was parsed as a
# missing value when countries.txt was read (default na.strings), so sorting
# on the column itself pushes that row to the very end. Sort on a character
# key with the code restored instead; countries_list itself is not modified.
# NOTE(review): assumes Namibia is the only row with a missing code -- the
# lasting fix is to pass na.strings = '' where countries.txt is read.
country.code.key <- as.character(countries_list$country.code)
country.code.key[is.na(country.code.key)] <- "NA"
kable(countries_list[order(country.code.key)])
country.code | country.name |
---|---|
AD | Andorra |
AE | United Arab Emirates |
AF | Afghanistan |
AG | Antigua and Barbuda |
AI | Anguilla |
AL | Albania |
AM | Armenia |
AN | Netherlands Antilles |
AO | Angola |
AQ | Antarctica |
AR | Argentina |
AS | American Samoa |
AT | Austria |
AU | Australia |
AW | Aruba |
AX | Åland |
AZ | Azerbaijan |
BA | Bosnia and Herzegovina |
BB | Barbados |
BD | Bangladesh |
BE | Belgium |
BF | Burkina Faso |
BG | Bulgaria |
BH | Bahrain |
BI | Burundi |
BJ | Benin |
BL | Saint Barthélemy |
BM | Bermuda |
BN | Brunei |
BO | Bolivia |
BQ | Bonaire |
BR | Brazil |
BS | Bahamas |
BT | Bhutan |
BV | Bouvet Island |
BW | Botswana |
BY | Belarus |
BZ | Belize |
CA | Canada |
CC | Cocos [Keeling] Islands |
CD | Democratic Republic of the Congo |
CF | Central African Republic |
CG | Republic of the Congo |
CH | Switzerland |
CI | Ivory Coast |
CK | Cook Islands |
CL | Chile |
CM | Cameroon |
CN | China |
CO | Colombia |
CR | Costa Rica |
CS | Serbia |
CU | Cuba |
CV | Cape Verde |
CW | Curacao |
CX | Christmas Island |
CY | Cyprus |
CZ | Czechia |
DE | Germany |
DJ | Djibouti |
DK | Denmark |
DM | Dominica |
DO | Dominican Republic |
DZ | Algeria |
EC | Ecuador |
EE | Estonia |
EG | Egypt |
EH | Western Sahara |
ER | Eritrea |
ES | Spain |
ET | Ethiopia |
FI | Finland |
FJ | Fiji |
FK | Falkland Islands |
FM | Micronesia |
FO | Faroe Islands |
FR | France |
GA | Gabon |
GB | Great Britain |
GD | Grenada |
GE | Georgia |
GF | French Guiana |
GG | Guernsey |
GH | Ghana |
GI | Gibraltar |
GL | Greenland |
GM | Gambia |
GN | Guinea |
GP | Guadeloupe |
GQ | Equatorial Guinea |
GR | Greece |
GS | South Georgia and the South Sandwich Islands |
GT | Guatemala |
GU | Guam |
GW | Guinea-Bissau |
GY | Guyana |
HK | Hong Kong |
HM | Heard Island and McDonald Islands |
HN | Honduras |
HR | Croatia |
HT | Haiti |
HU | Hungary |
ID | Indonesia |
IE | Ireland |
IL | Israel |
IM | Isle of Man |
IN | India |
IO | British Indian Ocean Territory |
IQ | Iraq |
IR | Iran |
IS | Iceland |
IT | Italy |
JE | Jersey |
JM | Jamaica |
JO | Jordan |
JP | Japan |
KE | Kenya |
KG | Kyrgyzstan |
KH | Cambodia |
KI | Kiribati |
KM | Comoros |
KN | Saint Kitts and Nevis |
KP | North Korea |
KR | South Korea |
KW | Kuwait |
KY | Cayman Islands |
KZ | Kazakhstan |
LA | Laos |
LB | Lebanon |
LC | Saint Lucia |
LI | Liechtenstein |
LK | Sri Lanka |
LR | Liberia |
LS | Lesotho |
LT | Lithuania |
LU | Luxembourg |
LV | Latvia |
LY | Libya |
MA | Morocco |
MC | Monaco |
MD | Moldova |
ME | Montenegro |
MF | Saint Martin |
MG | Madagascar |
MH | Marshall Islands |
MK | Macedonia |
ML | Mali |
MM | Myanmar [Burma] |
MN | Mongolia |
MO | Macao |
MP | Northern Mariana Islands |
MQ | Martinique |
MR | Mauritania |
MS | Montserrat |
MT | Malta |
MU | Mauritius |
MV | Maldives |
MW | Malawi |
MX | Mexico |
MY | Malaysia |
MZ | Mozambique |
NC | New Caledonia |
NE | Niger |
NF | Norfolk Island |
NG | Nigeria |
NI | Nicaragua |
NL | Netherlands |
NO | Norway |
NP | Nepal |
NR | Nauru |
NU | Niue |
NZ | New Zealand |
OM | Oman |
PA | Panama |
PE | Peru |
PF | French Polynesia |
PG | Papua |
PH | Philippines |
PK | Pakistan |
PL | Poland |
PM | Saint Pierre and Miquelon |
PN | Pitcairn Islands |
PR | Puerto Rico |
PS | Palestine |
PT | Portugal |
PW | Palau |
PY | Paraguay |
QA | Qatar |
RE | Réunion |
RO | Romania |
RS | Serbia |
RU | Russia |
RW | Rwanda |
SA | Saudi Arabia |
SB | Solomon Islands |
SC | Seychelles |
SD | Sudan |
SE | Sweden |
SG | Singapore |
SH | Saint Helena |
SI | Slovenia |
SJ | Svalbard and Jan Mayen |
SK | Slovakia |
SL | Sierra Leone |
SM | San Marino |
SN | Senegal |
SO | Somalia |
SR | Suriname |
SS | South Sudan |
ST | São Tomé and Príncipe |
SV | El Salvador |
SX | Sint Maarten |
SY | Syria |
SZ | Swaziland |
TC | Turks and Caicos Islands |
TD | Chad |
TF | French Southern Territories |
TG | Togo |
TH | Thailand |
TJ | Tajikistan |
TK | Tokelau |
TL | East Timor |
TM | Turkmenistan |
TN | Tunisia |
TO | Tonga |
TR | Turkey |
TT | Trinidad and Tobago |
TV | Tuvalu |
TW | Taiwan |
TZ | Tanzania |
UA | Ukraine |
UG | Uganda |
UM | U.S. Minor Outlying Islands |
US | United States |
UY | Uruguay |
UZ | Uzbekistan |
VA | Vatican City |
VC | Saint Vincent and the Grenadines |
VE | Venezuela |
VG | British Virgin Islands |
VI | U.S. Virgin Islands |
VN | Vietnam |
VU | Vanuatu |
WF | Wallis and Futuna |
YE | Yemen |
YT | Mayotte |
ZA | South Africa |
ZM | Zambia |
ZW | Zimbabwe |
NA | Namibia |