TF Text Processing

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the CSV data with pandas
data = pd.read_csv('./bbc-text.csv')
# Convert the DataFrame to a numpy array
data = data.to_numpy()
# Split the columns: labels in column 0, article text in column 1
labels = data[:, 0]
sentences = data[:, 1]
# Create the tokenizer

# num_words=100 would keep only the 100 most frequent words; with oov_token
# set, any word outside that vocabulary is replaced by '<oov>'
# stokenizer = Tokenizer(num_words=100, oov_token='<oov>')

stokenizer = Tokenizer()
stokenizer.fit_on_texts(sentences)

# Print the word-to-index mapping built by fit_on_texts
word_index = stokenizer.word_index
print(word_index)

# Convert each text into a sequence of word indices
sequences = stokenizer.texts_to_sequences(sentences)
print(sequences[0])

# Pad shorter sequences with 0 so they all share the same length. Padding goes
# at the front by default; padding='post'/'pre' and truncating='post'/'pre'
# control where values are added or cut off
# pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre')
padded = pad_sequences(sequences)


print(padded[0])
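
The commented-out Tokenizer(num_words=100, oov_token='<oov>') line above changes how rare and unseen words are handled. A minimal sketch of that behavior, using made-up training sentences:

# Toy data to illustrate oov_token (hypothetical example)
from tensorflow.keras.preprocessing.text import Tokenizer

train_texts = ['i love you', 'do you have money']
tok = Tokenizer(oov_token='<oov>')
tok.fit_on_texts(train_texts)

# '<oov>' itself gets index 1; real words are indexed from 2 onward
print(tok.word_index)

# Words never seen during fit_on_texts ('hate', 'dogs') map to the
# <oov> index instead of being silently dropped, which is what happens
# when oov_token is not set
print(tok.texts_to_sequences(['i hate dogs']))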

Output: (omitted; the toy example below walks through the same pipeline)

Example
sentences = ['i love you','do you have money', 'i like running']

word_index (indices follow descending word frequency: 'i' and 'you' each appear twice, so they get the smallest indices)
{'i': 1, 'you': 2, 'love': 3, 'do': 4, 'have': 5, 'money': 6, 'like': 7, 'running': 8}

sequences
[[1, 3, 2], [4, 2, 5, 6], [1, 7, 8]]

padded uses the length of the longest sentence as maxlen by default and pads at the front of each sequence
[[0 1 3 2]
 [4 2 5 6]
 [0 1 7 8]]
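
The padding and truncating parameters mentioned in the code comments control which end of the sequence gets padded or cut, and maxlen fixes the output length. A quick sketch on the same toy sequences:

from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = [[1, 3, 2], [4, 2, 5, 6], [1, 7, 8]]

# Pad at the end instead of the front
print(pad_sequences(sequences, padding='post'))
# [[1 3 2 0]
#  [4 2 5 6]
#  [1 7 8 0]]

# Force a fixed length of 3; the longer sequence is cut at the end
# (the default truncating='pre' would cut from the front instead)
print(pad_sequences(sequences, maxlen=3, truncating='post'))
# [[1 3 2]
#  [4 2 5]
#  [1 7 8]]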