朴素贝叶斯实现文本分类
Python 代码注释

代码部分

from numpy import*
def loadDataSet():
    """Return the toy training corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        posts that have already been split into word tokens, and ``classVec``
        holds the label of each post in the same order: 1 marks an abusive
        post, 0 a normal one.
    """
    # Documents after tokenisation.
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # 1 = abusive text, 0 = normal speech.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Build the vocabulary: every distinct token that occurs in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each document being an iterable of
            hashable tokens.

    Returns:
        A list containing each distinct token exactly once, in the order in
        which the tokens are first encountered.
    """
    # dict keys are unique and keep insertion order, so the vocabulary (and
    # therefore the column order of every feature vector built from it) is
    # identical on every run, which list(set(...)) does not guarantee.
    return list(dict.fromkeys(word for document in dataSet for word in document))

def setOfWords2Vec(vocabSet, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabSet: The vocabulary as a sequence of tokens (despite the name it
            is indexed by position, e.g. the list from ``createVocabList``).
        inputSet: The document, as an iterable of tokens.

    Returns:
        A list of ``len(vocabSet)`` integers; position ``i`` is 1 when
        ``vocabSet[i]`` occurs in ``inputSet`` and 0 otherwise. Tokens missing
        from the vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> column lookup once instead of scanning the vocabulary
    # twice per word. setdefault keeps the first position of a repeated
    # vocabulary entry, matching list.index().
    positions = {}
    for position, word in enumerate(vocabSet):
        positions.setdefault(word, position)

    returnVec = [0] * len(vocabSet)  # start from the all-zero vector
    for word in inputSet:
        position = positions.get(word)
        if position is None:
            print("The word :%s is not in the vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

def trainNb0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: Document matrix, one word vector per document (rows) and
            one column per vocabulary word.
        trainCategory: Label vector with one entry per document. Documents
            labelled 1 are counted for class 1; every other document is
            counted for class 0.

    Returns:
        A ``(p0Vest, p1Vest, pAbusive)`` tuple: the per-word log conditional
        probabilities ``log P(w|0)`` and ``log P(w|1)`` as numpy arrays, and
        the class-1 prior ``P(C1)`` (so ``P(C0) = 1 - P(C1)``).

    Raises:
        ValueError: If ``trainMatrix`` is not a non-empty 2-D matrix or if
            ``trainCategory`` does not hold exactly one label per document.
    """
    docs = asarray(trainMatrix, dtype=float)
    labels = asarray(trainCategory)
    if docs.ndim != 2 or docs.shape[0] == 0:
        raise ValueError("trainMatrix must be a non-empty 2-D document matrix")
    numTrainDocs = docs.shape[0]
    if labels.shape != (numTrainDocs,):
        raise ValueError("trainCategory must contain one label per document")

    pAbusive = labels.sum() / float(numTrainDocs)  # P(C1); P(C0) = 1 - P(C1)

    isAbusive = labels == 1
    abusiveDocs = docs[isAbusive]
    normalDocs = docs[~isAbusive]

    # Smoothing: word counts start at 1 and the totals at 2.0, so a word never
    # seen in a class still gets a non-zero probability and log() stays finite.
    p1Num = 1.0 + abusiveDocs.sum(axis=0)  # per-word counts within class 1
    p0Num = 1.0 + normalDocs.sum(axis=0)   # per-word counts within class 0
    p1Denom = 2.0 + abusiveDocs.sum()      # total word count of class 1
    p0Denom = 2.0 + normalDocs.sum()       # total word count of class 0

    p1Vest = log(p1Num / p1Denom)  # log P(w|1)
    p0Vest = log(p0Num / p0Denom)  # log P(w|0)
    return p0Vest, p1Vest, pAbusive

import bayes


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with trained naive Bayes parameters.

    Args:
        vec2Classify: Word vector of the document to classify (array or plain
            sequence of numbers).
        p0Vec: Per-word log probabilities for class 0, as returned by
            ``trainNb0``.
        p1Vec: Per-word log probabilities for class 1, as returned by
            ``trainNb0``.
        pClass1: Prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # Coerce once so plain lists work too, and use the array's own sum() so
    # the result does not depend on which ``sum`` the module namespace holds.
    features = asarray(vec2Classify)
    # Working in log space, each score is the sum of the log-likelihoods of
    # the words present plus the log prior of the class.
    p1 = (features * p1Vec).sum() + log(pClass1)
    p0 = (features * p0Vec).sum() + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0

def testingNB():
    """Train the classifier on the toy corpus and classify one sample post.

    Loads the built-in data set, builds the vocabulary, converts every post
    to a word vector, trains with ``trainNb0`` and prints the predicted class
    of the sample entry. Returns nothing.
    """
    # Call the functions defined in this file directly; the original mixed
    # ``bayes.``-qualified calls with an unqualified one.
    listOposts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOposts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOposts]
    p0V, p1V, pAb = trainNb0(trainMat, listClasses)

    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'the classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

注释部分

1.Python 集合set()添加删除、交集、并集、集合操作

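  • a = set('boy') – 创建 set,字符串会被拆成单个字符,得到 {'b', 'o', 'y'}
  • a.add('python') – 添加,把 'python' 作为一个整体元素加入集合
  • a.update('python') – 把传入的元素拆分,作为个体传入(此处每个字符分别加入集合)
  • a.remove('python') – 删除元素 'python';若该元素不存在会抛出 KeyError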
  • 集合操作
    集合操作