Collocation

Speech analysis

Introduction

Using dataset https://data.gov.tw/dataset/42540

Loading packages

library(tidyverse)
library(tidyr) 
library(jiebaR)
options(stringsAsFactors = F)
options(scipen = 999)

Loading data

raw.df <- readRDS("data/toChinaSpeech.rds") %>%
    mutate(doc_id = str_c("doc", str_pad(row_number(), 2, pad = "0"))) %>%
    mutate(nchar = nchar(content)) %>%
    select(doc_id, content, title, nchar) %>%
    mutate(content = str_replace(content, "【總統府新聞稿】", "")) %>%
    mutate(content = stringr::str_replace_all(content, "台灣", "臺灣"))

Initialize jieba

cutter <- worker()
segment_not <- c("蔡英文", "南向政策", "副總統", "大分", "新南向政策", "玉山論壇")
new_user_word(cutter, segment_not)
stopWords <- readRDS("data/stopWords.rds")

Significant words between docs

Log-ratio

doc_word_wide <- unnested.df %>%
  count(doc_id, word) %>%
  filter(n > 5) %>%
  spread(doc_id, n, fill = 0)

doc_relate_ratios <- doc_word_wide %>%
  select(word, docA = doc01, docB = doc03) %>%
  mutate(docA = (docA+1)/sum(docA+1)) %>%
  mutate(docB = (docB+1)/sum(docB+1)) %>%
  mutate(logratio = log2(docA / docB)) %>%
  arrange(desc(logratio))

doc_relate_ratios %>%
  group_by(logratio > 0) %>%
  top_n(10, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col() + 
  coord_flip() + 
  theme(axis.text.y=element_text(colour="black", family="Heiti TC Light"))

tf-idf

library(tidytext)

tf_idf.df <- unnested.df %>%
    group_by(word) %>%
    filter(n() > 10) %>%
    ungroup() %>%
    count(doc_id, word) %>%
    bind_tf_idf(word, doc_id, n) %>%
    group_by(doc_id) %>%
    top_n(10, tf_idf) %>%
    mutate(word = reorder(word, tf_idf)) %>%
    ungroup()

tf_idf.df %>%
    ggplot() + aes(word, tf_idf) + 
    geom_col() +
    facet_wrap(~ doc_id, scales = "free") + 
    coord_flip() + 
    theme(axis.text.y=element_text(family="Heiti TC Light"))

Collocation

Sentence extraction

tidy.df <- raw.df %>% 
    mutate(sentence = str_split(content, "。")) %>%
    select(-content, -title) %>%
    unnest(sentence) %>%
    mutate(sentence_len = nchar(sentence)) %>%
    filter(sentence_len > 10) %>% 
    group_by(doc_id) %>%
    mutate(sentence_id = str_c(doc_id, str_pad(row_number(), 4, pad = "0"))) %>%
    ungroup()

tidy.df %>%
    ggplot() + aes(sentence_len) + 
    geom_density()

tokenization

unnested.df <- tidy.df %>%
    select(sentence_id, sentence) %>%
    mutate(word = purrr::map(sentence, function(x)segment(x, cutter))) %>%
    unnest(word) %>%
    filter(!is.na(word)) %>%
    filter(!(word %in% stopWords$word)) %>%
    filter(!str_detect(word, "[a-zA-Z0-9]+"))

pair counts

# install.packages("widyr")
library(widyr)

word_pairs <- unnested.df %>%
    filter(!word %in% c("我們", "臺灣", "總統")) %>%
    pairwise_count(word, sentence_id, sort = TRUE)

word_tf_idf <- unnested.df %>%
    filter(word %in% tf_idf.df$word) %>%
    pairwise_count(word, sentence_id, sort = TRUE) %>%

# install.packages("ggraph")
library(igraph)
library(ggraph)

word_pairs %>%
    ggplot(aes(n)) +
    geom_density()

set.seed(2016)
word_pairs %>%
  filter(n > 8) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = F, family = "Heiti TC Light") +
  theme_void()

Pearson Correlation

phi-correlation https://en.wikipedia.org/wiki/Phi_coefficient

word_corr <- unnested.df %>%
    # filter(!word %in% c("我們", "臺灣")) %>%
    group_by(word) %>%
    filter(n() > 20) %>%
    pairwise_cor(word, sentence_id, sort = TRUE)

word_corr %>%
    ggplot(aes(correlation)) +
    geom_density()

set.seed(2016)
word_corr %>%
  filter(correlation > .15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = F, family = "Heiti TC Light") +
  theme_void()

Chi-square

PMI

word_pmi <- unnested.df %>%
    group_by(word) %>%
    filter(n() > 20) %>%
    pairwise_pmi(word, sentence_id, sort = TRUE)

word_pmi %>%
    ggplot(aes(pmi)) +
    geom_density()

set.seed(2016)
word_pmi %>%
  filter(pmi > mean(pmi) + 1.5*sd(pmi)) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = pmi, color = "blue"), show.legend = FALSE) +
  geom_node_point(color = "blue", size = 3, alpha=0.4) +
  geom_node_text(aes(label = name), repel = F, family = "Heiti TC Light") +
  theme_void()

PreviousRegular expression Next爬蟲概念簡介

Last updated 5 years ago

Was this helpful?