Collocation
Last updated
Was this helpful?
Last updated
Was this helpful?
Using dataset
library(tidyverse)
library(tidyr)
library(jiebaR)
options(stringsAsFactors = F)
options(scipen = 999)
raw.df <- readRDS("data/toChinaSpeech.rds") %>%
mutate(doc_id = str_c("doc", str_pad(row_number(), 2, pad = "0"))) %>%
mutate(nchar = nchar(content)) %>%
select(doc_id, content, title, nchar) %>%
mutate(content = str_replace(content, "【總統府新聞稿】", "")) %>%
mutate(content = stringr::str_replace_all(content, "台灣", "臺灣"))
cutter <- worker()
segment_not <- c("蔡英文", "南向政策", "副總統", "大分", "新南向政策", "玉山論壇")
new_user_word(cutter, segment_not)
stopWords <- readRDS("data/stopWords.rds")
doc_word_wide <- unnested.df %>%
count(doc_id, word) %>%
filter(n > 5) %>%
spread(doc_id, n, fill = 0)
doc_relate_ratios <- doc_word_wide %>%
select(word, docA = doc01, docB = doc03) %>%
mutate(docA = (docA+1)/sum(docA+1)) %>%
mutate(docB = (docB+1)/sum(docB+1)) %>%
mutate(logratio = log2(docA / docB)) %>%
arrange(desc(logratio))
doc_relate_ratios %>%
group_by(logratio > 0) %>%
top_n(10, abs(logratio)) %>%
ungroup() %>%
mutate(word = reorder(word, logratio)) %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
geom_col() +
coord_flip() +
theme(axis.text.y=element_text(colour="black", family="Heiti TC Light"))
library(tidytext)
tf_idf.df <- unnested.df %>%
group_by(word) %>%
filter(n() > 10) %>%
ungroup() %>%
count(doc_id, word) %>%
bind_tf_idf(word, doc_id, n) %>%
group_by(doc_id) %>%
top_n(10, tf_idf) %>%
mutate(word = reorder(word, tf_idf)) %>%
ungroup()
tf_idf.df %>%
ggplot() + aes(word, tf_idf) +
geom_col() +
facet_wrap(~ doc_id, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(family="Heiti TC Light"))
tidy.df <- raw.df %>%
mutate(sentence = str_split(content, "。")) %>%
select(-content, -title) %>%
unnest(sentence) %>%
mutate(sentence_len = nchar(sentence)) %>%
filter(sentence_len > 10) %>%
group_by(doc_id) %>%
mutate(sentence_id = str_c(doc_id, str_pad(row_number(), 4, pad = "0"))) %>%
ungroup()
tidy.df %>%
ggplot() + aes(sentence_len) +
geom_density()
unnested.df <- tidy.df %>%
select(sentence_id, sentence) %>%
mutate(word = purrr::map(sentence, function(x)segment(x, cutter))) %>%
unnest(word) %>%
filter(!is.na(word)) %>%
filter(!(word %in% stopWords$word)) %>%
filter(!str_detect(word, "[a-zA-Z0-9]+"))
# install.packages("widyr")
library(widyr)
word_pairs <- unnested.df %>%
filter(!word %in% c("我們", "臺灣", "總統")) %>%
pairwise_count(word, sentence_id, sort = TRUE)
word_tf_idf <- unnested.df %>%
filter(word %in% tf_idf.df$word) %>%
pairwise_count(word, sentence_id, sort = TRUE) %>%
# install.packages("ggraph")
library(igraph)
library(ggraph)
word_pairs %>%
ggplot(aes(n)) +
geom_density()
set.seed(2016)
word_pairs %>%
filter(n > 8) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = F, family = "Heiti TC Light") +
theme_void()
word_corr <- unnested.df %>%
# filter(!word %in% c("我們", "臺灣")) %>%
group_by(word) %>%
filter(n() > 20) %>%
pairwise_cor(word, sentence_id, sort = TRUE)
word_corr %>%
ggplot(aes(correlation)) +
geom_density()
set.seed(2016)
word_corr %>%
filter(correlation > .15) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = F, family = "Heiti TC Light") +
theme_void()
word_pmi <- unnested.df %>%
group_by(word) %>%
filter(n() > 20) %>%
pairwise_pmi(word, sentence_id, sort = TRUE)
word_pmi %>%
ggplot(aes(pmi)) +
geom_density()
set.seed(2016)
word_pmi %>%
filter(pmi > mean(pmi) + 1.5*sd(pmi)) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = pmi, color = "blue"), show.legend = FALSE) +
geom_node_point(color = "blue", size = 3, alpha=0.4) +
geom_node_text(aes(label = name), repel = F, family = "Heiti TC Light") +
theme_void()
phi-correlation