utils.py
import os
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from nltk.tokenize import TweetTokenizer
import multiprocessing as mp


def read_data(exp_dataset: str, data_path: str = 'dataset', dataset2folder: dict = {'R8': 'R8', 'R52': 'R52', '20ng': '20ng', 'Ohsumed': 'ohsumed_single_23', 'MR': 'mr'}):
    """Read the specified dataset, handling each dataset's directory structure, and return it as a pandas DataFrame.

    Args:
        exp_dataset (str): The dataset to run the experiment on.
        data_path (str, optional): The folder that stores the datasets. Defaults to 'dataset'.
        dataset2folder (dict, optional): Mapping from dataset name to its folder name. Defaults to {'R8': 'R8', 'R52': 'R52', '20ng': '20ng', 'Ohsumed': 'ohsumed_single_23', 'MR': 'mr'}.

    Returns:
        pandas.DataFrame: A DataFrame with columns:
            1. target - whether the row belongs to the train or test split.
            2. label  - the label of the row.
            3. text   - the textual content of the row.
    """
print("\nLoading dataset..")
dataset = []
    # R8 / R52 (Reuters)
    if exp_dataset == 'R8' or exp_dataset == 'R52':
targets = ['train.txt', 'test.txt']
for target in targets:
text_data_path = os.path.join(data_path, dataset2folder[exp_dataset], target)
with open(text_data_path) as f:
lines = f.readlines()
for line in lines:
label, text = line.strip().split('\t')
# add doc
dataset.append((target[:-4], label, text))
# ohsumed_single_23
elif exp_dataset == 'Ohsumed':
targets = {'training':'train', 'test':'test'}
for target in targets:
            target_data_path = os.path.join(data_path, dataset2folder[exp_dataset], target)
            for label in os.listdir(target_data_path):
                for doc in os.listdir(os.path.join(target_data_path, label)):
                    with open(os.path.join(target_data_path, label, doc)) as f:
lines = f.readlines()
text = " ".join([line.strip() for line in lines])
# add doc
dataset.append((targets[target], label, text))
# 20 ng
elif exp_dataset =='20ng':
from sklearn.datasets import fetch_20newsgroups
for target in ['train', 'test']:
data = fetch_20newsgroups(subset=target, shuffle=True, random_state=42, remove = ('headers', 'footers', 'quotes'))
dataset += list(map(lambda sample: (target, data['target_names'][sample[0]], sample[1].replace("\n", " ")), zip(data['target'], data['data'])))
# movie review
elif exp_dataset == 'MR':
for target in ['train', 'test']:
text_data_path = os.path.join(data_path, dataset2folder[exp_dataset], "text_{}.txt".format(target))
with open(text_data_path, 'rb') as f:
text_lines = f.readlines()
label_data_path = os.path.join(data_path, dataset2folder[exp_dataset], "label_{}.txt".format(target))
with open(label_data_path, 'rb') as f:
label_lines = f.readlines()
            # decode the raw bytes explicitly; latin-1 is assumed here because the MR files may contain non-UTF-8 characters
            dataset += [(target, label.strip().decode('latin1'), text.strip().decode('latin1')) for (text, label) in zip(text_lines, label_lines)]
else:
print("Wrong dataset!")
exit()
print("\tDataset Loaded! Total:", len(dataset))
return pd.DataFrame(dataset, columns=["target", "label", "text"])
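
# Hypothetical usage sketch (assumes, e.g., `dataset/R8/train.txt` and `dataset/R8/test.txt`
# exist as tab-separated `label\ttext` lines, as read above):
#
#   data_pd = read_data('R8', data_path='dataset')
#   print(data_pd.target.value_counts())   # rows per train/test split
#   print(data_pd.head())                  # columns: target, label, text
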
def data_preprocessing(data_pd:pd.DataFrame, max_len: int = 1000, tokenizer=TweetTokenizer()):
"""Tokenizing sentences/texts in the dataset and remove empty/long content.
Args:
data_pd (pd.DataFrame): The DataFrame of experiment dataset.
max_len (int, optional): The maximum numbers of text tokens to keep for each data. Set `0` for keep all the tokens. Defaults to 1000.
tokenizer (nltk.tokenize.Tokenizer(), optional): [description]. Defaults to TweetTokenizer().
Returns:
pandas.DataFrame: The DataFrame is constructed with an additional column `tokens` which is a set of tokens from `text` column.
"""
print("Data Proprecessing..")
print("\tText Tokenizing..")
with mp.Pool(mp.cpu_count()) as p:
data_pd['tokens'] = p.starmap(tokenize, map(lambda text: (text, tokenizer),data_pd.text.tolist()))
print("\tRemove data with empty content")
data_pd = data_pd[data_pd.tokens.apply(lambda tokens: (len(tokens) > 0))]
    # Remove texts with too many tokens (remove outliers)
if max_len != 0:
print("\t\tData trimming, max sentence length:", max_len)
data_pd = data_pd[data_pd.tokens.apply(lambda tokens: (len(tokens) < max_len))]
## Uncomment to enable the filtering based on distribution
# print("\n Sentence length distrbution:\n")
# distribution = data_pd['tokens'].apply(lambda tokens: len(tokens)).describe()
# print(distribution)
# MAX_LEN = int(distribution['mean']+ 3*distribution['std'])
# print("\t\tRemove text > {} tokens (mean + 3x std of word counts)".format(MAX_LEN))
# data_pd = data_pd[data_pd.tokens.apply(lambda tokens: len(tokens) < distribution['mean'] + 3 * distribution['std'])]
print("\n\t\tFinish trimming, Total Rest:", len(data_pd))
return data_pd
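
# Hypothetical usage sketch: tokenize and trim the frame returned by `read_data`.
# `TweetTokenizer` is only the default; any object with a `tokenize(str) -> list` method works.
#
#   data_pd = read_data('R8')
#   data_pd = data_preprocessing(data_pd, max_len=1000)
#   print(data_pd.tokens.apply(len).describe())   # token-count distribution after trimming
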
def tokenize(text: str, tokenizer):
    """Lower-case and tokenize a single text; kept at module level so it can be pickled for multiprocessing."""
    return [w.lower() for w in tokenizer.tokenize(text)]

def features_extracting(data_pd: pd.DataFrame, minimum_word_count: int = 15, neighbor_distance: int = 2):
    """Extract word indices and word-neighbor features, and encode the label as a label index.

    Args:
        data_pd (pd.DataFrame): The preprocessed DataFrame (must contain a `tokens` column).
        minimum_word_count (int, optional): Words whose count is not above this threshold are treated as unknown. Defaults to 15.
        neighbor_distance (int, optional): The adjacency distance to consider as a neighbor. Defaults to 2.

    Returns:
        pandas.DataFrame: The DataFrame with additional columns `X`, `X_Neighbors`, `y` for word indices, neighbor word indices, and the label index.
        dict: The dictionary mapping each word to its index.
    """
print("\n Feature Extracting..")
word2idx = construct_word2idx(data_pd.tokens.to_list(), minimum_word_count= minimum_word_count)
# get index representation of words
data_pd['X'] = transform_word2idx_mp(data_pd.tokens.to_list(), word2idx=word2idx)
# get word's neighbors with specific neighbor distance
data_pd['X_Neighbors'] = get_word_neighbors_mp(data_pd.X.to_list(), neighbor_distance=neighbor_distance)
    # get label as its category index
data_pd['y'] = pd.Categorical(data_pd.label).codes
print("\tFinish, adding `X`, `X_Neighbors` and `y` columns to DataFrame")
return data_pd, word2idx
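
# Hypothetical usage sketch: full feature pipeline on top of the two steps above.
#
#   data_pd = data_preprocessing(read_data('R8'))
#   data_pd, word2idx = features_extracting(data_pd, minimum_word_count=15, neighbor_distance=2)
#   # data_pd.X           -> list of word indices per document
#   # data_pd.X_Neighbors -> per-token neighbor indices (0 = _PAD_ when out of range)
#   # data_pd.y           -> integer label codes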

def construct_word2idx(tokens_of_texts: list, minimum_word_count: int = 15):
    """Build a word-to-index dictionary from the tokenized texts.

    Args:
        tokens_of_texts (list): A list of token lists, one per sentence/text in the dataset.
        minimum_word_count (int, optional): The lower bound on word count; words whose frequency is not above it are discarded. Defaults to 15.

    Returns:
        dict: The dictionary mapping each word to its index, where index `0` is reserved for the padding token `_PAD_` and the last index for `_UNKNOW_` (unknown and rare words).
    """
word_counts = Counter([w for tokens in tokens_of_texts for w in tokens])
print("\tMost common words:", word_counts.most_common(10))
# remove rare words
qualified_words = [w for w,v in word_counts.items() if v > minimum_word_count]
    word2idx = {word: i + 1 for i, word in enumerate(qualified_words)}
    word2idx['_PAD_'] = 0
    word2idx['_UNKNOW_'] = len(qualified_words) + 1
    print("\tTotal words:", len(qualified_words), ", Total # word embeddings:", len(word2idx))
return word2idx
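
# Toy illustration (minimum_word_count=0, so nothing is filtered):
#   construct_word2idx([['a', 'b'], ['a']], minimum_word_count=0)
#   -> {'a': 1, 'b': 2, '_PAD_': 0, '_UNKNOW_': 3}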

def transform_word2idx_mp(tokens_of_texts: list, word2idx: dict):
    """Multiprocessing version of converting word tokens to their indices.

    Args:
        tokens_of_texts (list): A list of token lists, one per sentence/text in the dataset.
        word2idx (dict): The dictionary mapping each word to its index, where index `0` is the padding `_PAD_` and the last index is for unknown words.

    Returns:
        list: A list containing, for each text, the list of its token indices.
    """
    print("\tConverting words to their indices")
with mp.Pool(4) as p:
return p.starmap(transform_word2idx, map(lambda tokens: (tokens, word2idx), tokens_of_texts))

def transform_word2idx(tokens: list, word2idx: dict):
    """Convert a list of word tokens to their indices.

    Args:
        tokens (list): The tokens of a single sentence/text from the dataset.
        word2idx (dict): The dictionary mapping each word to its index, where index `0` is the padding `_PAD_` and the last index is for unknown words.

    Returns:
        list: A list of the indices of the tokens in the sentence.
    """
    unknow_idx = len(word2idx) - 1
    # words missing from the vocabulary fall back to the unknown index
    return [word2idx.get(w, unknow_idx) for w in tokens]
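
# Toy illustration: unseen words fall back to the `_UNKNOW_` index (len(word2idx) - 1).
#   transform_word2idx(['a', 'c'], {'a': 1, 'b': 2, '_PAD_': 0, '_UNKNOW_': 3})
#   -> [1, 3]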

def get_word_neighbors_mp(text_tokens: list, neighbor_distance: int):
    """Multiprocessing version of `get_word_neighbor`, applied to a list of token-index lists."""
    print("\tGet word's neighbors")
with mp.Pool(mp.cpu_count()) as p:
return p.starmap(get_word_neighbor, map(lambda tokens: (tokens, neighbor_distance), text_tokens))

def get_word_neighbor(text_tokens: list, neighbor_distance: int):
    """Get each token's adjacent neighbors within `neighbor_distance` positions.

    Args:
        text_tokens (list): The token indices of a single sentence/text from the dataset.
        neighbor_distance (int): The adjacency distance to consider as a neighbor.

    Returns:
        list: A nested 2-dimensional list: for every token (1st dim), the list of its neighbor token indices (2nd dim), with `0` (`_PAD_`) filling positions that fall outside the text.
    """
text_len = len(text_tokens)
edge_neighbors = []
for w_idx in range(text_len):
skip_neighbors = []
# check before
for sk_i in range(neighbor_distance):
before_idx = w_idx -1 - sk_i
skip_neighbors.append(text_tokens[before_idx] if before_idx > -1 else 0)
# check after
for sk_i in range(neighbor_distance):
after_idx = w_idx +1 +sk_i
skip_neighbors.append(text_tokens[after_idx] if after_idx < text_len else 0)
edge_neighbors.append(skip_neighbors)
return edge_neighbors
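

if __name__ == '__main__':
    # Minimal sanity-check sketch of the lower-level helpers; it uses toy data only and
    # does not require any dataset files on disk.
    toy_tokens = [['the', 'cat', 'sat'], ['the', 'dog', 'sat', 'down']]
    word2idx = construct_word2idx(toy_tokens, minimum_word_count=0)
    indexed = [transform_word2idx(tokens, word2idx) for tokens in toy_tokens]
    print("Indexed texts:", indexed)
    # Each token gets its 2 neighbors on either side; 0 (_PAD_) fills positions past the edges.
    print("Neighbors of first text:", get_word_neighbor(indexed[0], neighbor_distance=2))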