Read CSV, Excel, JSON, and R data files
讀取CSV檔
import pandas as pd

# Read the drug-advertisement dataset, skipping malformed rows.
# NOTE: error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines="skip" is the equivalent replacement.
df = pd.read_csv('data/14196_drug_adv.csv', on_bad_lines='skip')
set(df.刊播媒體類別)
{'其他', '平面媒體', '廣播電台', '網路', '電視'}
讀取Excel檔
import pandas as pd

# read_excel() has no error_bad_lines parameter (that option belongs to
# read_csv); passing it raises TypeError on modern pandas, so it is dropped.
df = pd.read_excel('../../R/clickbait_detection/labeled/tag_comparison_1st&2nd_round.xlsx')
計數以了解資料概況
Copy df.刊播媒體類別.value_counts()
`網路 1479
平面媒體 111
電視 86
廣播電台 71
其他 13
Name: 刊播媒體類別, dtype: int64
Copy from collections import Counter
type_dict = Counter(df.刊播媒體類別)
print(type_dict)
Counter({'網路': 1479, '平面媒體': 111, '電視': 86, '廣播電台': 71, '其他': 13})
1760
Pilot分析-群組化計數 group_by
Copy df.groupby("刊播媒體類別").count()
案例分析:摘要youbike
載入資料
Python (Option) by list comprehension
Copy import requests
data = requests.get('https://tcgbusfs.blob.core.windows.net/blobyoubike/YouBikeTP.gz').json()
all_list = []
for k, v in data["retVal"].items():
all_list.append(v)
ubike_df = pd.DataFrame(all_list)
ubike_df.head()
Copy import requests
data = requests.get('https://tcgbusfs.blob.core.windows.net/blobyoubike/YouBikeTP.gz').json()
all_list = [v for v in data["retVal"].values()]
ubike_df = pd.DataFrame(all_list)
ubike_df.head()
產生新的變項
產生新的變數(方法一)df = df.assign(new_var = old_var1 / old_var2)
to create or convert a new variable. Be careful! You must assign the result back to the left-hand side to overwrite the original df.
產生新的變數(方法二)df["new_var"] = df.old_var1 / df.old_var2
Python output
Copy ubike_df = ubike_df.assign(sbi = pd.to_numeric(ubike_df.sbi),\
tot = pd.to_numeric(ubike_df.tot))
ubike_df = ubike_df.assign(test = ubike_df.sbi/ubike_df.tot)
ubike_df.info()
Copy <class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 16 columns):
act 400 non-null object
ar 400 non-null object
aren 400 non-null object
bemp 400 non-null object
lat 400 non-null object
lng 400 non-null object
mday 400 non-null object
sarea 400 non-null object
sareaen 400 non-null object
sbi 400 non-null int64
sna 400 non-null object
snaen 400 non-null object
sno 400 non-null object
tot 400 non-null int64
ratio 400 non-null float64
test 400 non-null float64
dtypes: float64(2), int64(2), object(12)
memory usage: 50.1+ KB
觀察資料概況
觀察各個變數的分佈 df.info()
and df.describe()
修改變數型態 pd.to_numeric(var)
to convert data type
Read R RDS
The root node of an RDS file, converted to a pandas DataFrame, is accessed as `result[None]`.
Copy from os import listdir, getcwd
from os.path import isfile, join, isdir
import pyreadr
import pickle
import re
def detectFiletypes(fname, types="txt|tsv|csv"):
    """Return True if *fname* ends with one of the given file extensions.

    `types` is a '|'-separated list of extensions, e.g. "txt|tsv|csv".
    """
    # Anchor each alternative as a real ".ext" suffix. The original
    # pattern ("rds$", etc.) also matched names that merely end with the
    # same letters, e.g. detectFiletypes("birds", "rds") was True.
    pattern = r"\.(?:" + types + r")$"
    return bool(re.search(pattern, fname))
def rds_to_pickle(fpath=None):
    """Convert every .rds file directly inside *fpath* to a pickle file.

    For each RDS file found, the root data frame (pyreadr stores it under
    the key None) is dumped next to it with the same stem and a ".p"
    suffix. Defaults to the current working directory when no path is
    given; does nothing if *fpath* is not a directory.
    """
    if fpath is None:  # identity check, not `== None`
        fpath = getcwd()
        print("Without specify path, set to current directory automatically!")
        print(getcwd())
    if isdir(fpath):
        fns = [f for f in listdir(fpath)
               if isfile(join(fpath, f)) and detectFiletypes(f.lower(), "rds")]
        for fn in fns:
            result = pyreadr.read_r(join(fpath, fn))
            # The root node of an RDS file is stored under the key None.
            df = result[None]
            fn_p = fn.split(".")[0] + ".p"
            # Context manager closes the handle; the original
            # pickle.dump(df, open(..., "wb")) leaked the file object.
            with open(join(fpath, fn_p), "wb") as out:
                pickle.dump(df, out)
Read R RDA
Download data here for testing https://www.dropbox.com/s/qrgi3ralwqrhbq5/boy-girl_201906160922.rda?dl=0
Copy # !pip install pyreadr
import pyreadr
result = pyreadr.read_r('../R/ptt_scraping/pttdata/merged/boy-girl_201906160922.rda')
print(result.keys())
post = result["allp.df"]
print(post.keys())
odict_keys(['allc.df', 'allp.df'])
Index(['plink', 'board', 'pcontent', 'poster', 'ptitle', 'ptime', 'ipaddr', 'ip.len'], dtype='object')
Tokenizing post content
import re

# Collapse each post body into one whitespace-free string before
# tokenization. r"\s" is a raw string: the bare "\s" escape is invalid
# and raises a SyntaxWarning on modern Python.
post['ptext'] = post['pcontent'].apply(lambda x: x.replace("\n", ""))
post['ptext'] = post['ptext'].apply(lambda x: re.sub(r"\s", "", x))
post.head(5)
M1. Tokenize one column by for-loop
import jieba

# Segment every post with jieba, printing progress every 1000 posts.
# enumerate() replaces the hand-maintained counter of the original.
token_post = []
for i, p in enumerate(post['ptext'], start=1):
    token_post.append(list(jieba.cut(p)))
    if i % 1000 == 0:
        print(i)
M2. Tokenize pandas columns by `apply()`
import pandas as pd
import jieba

def cut_word(word):
    """Segment one string into a list of tokens with jieba."""
    return list(jieba.cut(word))

df['cut_word'] = df['word'].apply(cut_word)
# explode() flattens the token lists in linear time; the original
# df['cut_word'].sum() concatenated the lists pairwise, which is
# quadratic in the total number of tokens.
df['cut_word'].explode().value_counts()
Applications: Building word2vec model
from gensim.models import Word2Vec

# Train a 300-dimensional word2vec model on the tokenized posts,
# keeping only words that appear at least 20 times.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` --
# change the keyword if running under gensim 4; verify installed version.
model = Word2Vec(token_post, min_count=20, size=300, window=8, workers=4)