Creating a stopword list is a common step in Chinese text processing. It is used to filter frequent function words out of the text, which reduces noise in the data and improves the results of downstream text analysis. Below are several ways to build and apply a stopword list in Python:
Method 1: Using the jieba library
import jieba  # jieba is imported because the stopword list is typically applied after jieba word segmentation

def stopwords_list(filepath):
    # Read one stopword per line from a UTF-8 text file
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords
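A minimal usage sketch: segment a sentence with jieba, then drop any token found in the loaded list. The file name 'stopwords.txt' is a placeholder for your own stopword file.

# Hypothetical usage; 'stopwords.txt' is a placeholder path
stopwords = stopwords_list('stopwords.txt')
words = jieba.lcut('今天的天气真的很不错')
filtered = [w for w in words if w not in stopwords]
print(filtered)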
Method 2: Using the nltk library
import nltk
nltk.download('stopwords')          # download the stopwords corpus on first use
from nltk.corpus import stopwords

# Load NLTK's built-in English stopword set
english_stopwords = set(stopwords.words('english'))
print(english_stopwords)
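A small usage sketch, assuming English input that is already tokenized by whitespace (the sentence is only an illustration):

# Hypothetical usage: remove English stopwords from a tokenized sentence
tokens = 'this is a simple example of stopword removal'.split()
kept = [t for t in tokens if t.lower() not in english_stopwords]
print(kept)   # ['simple', 'example', 'stopword', 'removal']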
Method 3: Manually creating a stopword list

def stopwordlist():
    # Load a hand-maintained stopword file (one word per line)
    with open('F:\\大数据\\大作业\\分词后的文件\\stopWord.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords
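If the list is small, it can also be defined directly in code instead of a file; a minimal sketch, where the words chosen are purely illustrative:

# Hypothetical in-code stopword set for a small, hand-picked list
manual_stopwords = {'的', '了', '和', '是', '在', '也', '就'}

def is_stopword(word):
    return word in manual_stopwords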
Method 4: Merging multiple stopword lists
import os

def merge_stopwords(path):
    # Union every .txt stopword file under the given directory into one list
    stopwords = set()
    for file in os.listdir(path):
        if file.endswith('.txt'):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                stopwords.update(line.strip() for line in f)
    return list(stopwords)
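A usage sketch, assuming a directory named stopword_lists/ that holds several .txt stopword files (the directory name is a placeholder):

# Hypothetical usage: merge every .txt stopword file under one directory
merged = merge_stopwords('stopword_lists')   # placeholder directory name
print(len(merged), 'unique stopwords after merging')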
Method 5: Filtering stopwords from text
def filter_text(text, stopwords):
    # Remove stopwords from whitespace-separated text, line by line
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        words = line.split()
        filtered_words = [word for word in words if word not in stopwords]
        filtered_lines.append(' '.join(filtered_words))
    return '\n'.join(filtered_lines)
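Note that filter_text splits on whitespace, so Chinese text should be segmented first. A minimal end-to-end sketch that reuses stopwords_list from Method 1; the stopword file name is a placeholder.

# Hypothetical end-to-end usage: segment with jieba, then filter stopwords
import jieba

raw_text = '今天的天气真的很不错'
segmented = ' '.join(jieba.lcut(raw_text))          # space-separate the segmented words
stopwords = set(stopwords_list('stopwords.txt'))    # placeholder stopword file
print(filter_text(segmented, stopwords))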
