class BayesText:
    """Naive Bayes text model trained from a directory tree of documents."""

    def __init__(self, trainingdir, stopwordlist):
        """This class implements a naive Bayes approach to text
        classification.

        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.

        The stopwordlist is a file of words (one per line) that will be
        removed before any counting takes place.

        After construction:
          self.vocabulary  word -> occurrence count over all categories
                           (words seen fewer than 3 times are removed)
          self.totals      category -> number of counted tokens
          self.prob        category -> {word: smoothed P(word | category)}
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}  # stop-word dictionary (word -> 1)
        with open(stopwordlist) as f:
            for line in f:
                self.stopwords[line.strip()] = 1

        # filter out entries that are not directories
        self.categories = [filename for filename in os.listdir(trainingdir)
                           if os.path.isdir(os.path.join(trainingdir,
                                                         filename))]
        print("Counting ...")
        for category in self.categories:
            print(' ' + category)
            # per-category word counts and the category's token total
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)

        # Eliminate any word in the global vocabulary that doesn't occur
        # at least 3 times. Collect first, delete afterwards: a dict
        # cannot be resized while it is being iterated.
        toDelete = [word for word, seen in self.vocabulary.items()
                    if seen < 3]
        for word in toDelete:
            del self.vocabulary[word]

        # now compute probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print(' ' + category)
            denominator = self.totals[category] + vocabLength
            counts = self.prob[category]
            # Conditional probability with add-one smoothing. A fresh dict
            # is built so pruned words do not linger as raw counts next to
            # probabilities.
            # NOTE(review): a word unseen in this category is given a
            # count of 1 (as in the source) rather than the 0 that textbook
            # Laplace smoothing would use -- confirm this is intended.
            self.prob[category] = {
                word: float(counts.get(word, 1) + 1) / denominator
                for word in self.vocabulary}
        print("DONE TRAINING\n\n")

    def train(self, trainingdir, category):
        """counts word occurrences for a particular category

        input:  trainingdir -- directory holding the training files,
                category -- name of the category subdirectory to read
        return: (counts, total) -- counts maps each word seen in this
                category to its number of occurrences; total is the number
                of counted tokens in this category.

        Also adds every counted token to self.vocabulary.
        """
        currentdir = os.path.join(trainingdir, category)
        counts = {}
        total = 0
        for file in os.listdir(currentdir):
            # the context manager closes each file once it has been read
            with codecs.open(os.path.join(currentdir, file),
                             'r', 'iso8859-1') as f:
                for line in f:
                    for token in line.split():
                        # get rid of punctuation and lowercase token
                        token = token.strip('\'".,?:-').lower()
                        if token != '' and token not in self.stopwords:
                            # counts across all documents
                            self.vocabulary.setdefault(token, 0)
                            self.vocabulary[token] += 1
                            # counts for the current category
                            counts.setdefault(token, 0)
                            counts[token] += 1
                            # number of counted tokens
                            total += 1
        return (counts, total)

# test code
# NOTE(review): trainingDir is not defined anywhere in the visible source;
# it must be set to the training directory before this line runs.
bT = BayesText(trainingDir, stoplistfile)






def classify(self, itemVector, numVector):
    """Return class we think item Vector is in

    itemVector -- categorical attribute values; the value at position i
                  is looked up in self.conditional[category][i + 1]
    numVector  -- numeric attribute values; the value at position i is
                  scored with a Gaussian density using
                  self.means[category][i + 1] as the mean and
                  self.ssd[category][i + 1] as the standard deviation

    Each category starts from its prior in self.prior and is multiplied
    by one factor per attribute. Returns the category of the largest
    (probability, category) pair.
    """
    results = []
    sqrt2pi = math.sqrt(2 * math.pi)
    for (category, prior) in self.prior.items():
        prob = prior
        # columns are numbered from 1 in the model tables
        for col, attrValue in enumerate(itemVector, 1):
            likelihoods = self.conditional[category][col]
            if attrValue not in likelihoods:
                # we did not find any instances of this attribute value
                # occurring with this category so prob = 0
                prob = 0
            else:
                prob = prob * likelihoods[attrValue]
        for col, x in enumerate(numVector, 1):
            mean = self.means[category][col]
            ssd = self.ssd[category][col]
            # 2.0 keeps the exponent a true division for integer inputs
            ePart = math.pow(math.e, -(x - mean) ** 2 / (2.0 * ssd ** 2))
            prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
        results.append((prob, category))
    # return the category with the highest probability
    return max(results)[1]

# test code
# NOTE(review): testDir is not defined anywhere in the visible source; it
# must name the directory that holds the held-out documents.
bT.classify(testDir + 'rec.motorcycles/104673')

# 10-fold cross-validation

from __future__ import print_function

import os, codecs, math

class BayesText:
    """Naive Bayes text classifier that can hold out one bucket of data."""

    # input: training directory, stop-word file, bucket to leave out
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification.

        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain one numbered
        bucket directory per fold, which hold the text files for that
        category.

        The stopwordlist is a file of words (one per line) that will be
        removed before any counting takes place.

        ignoreBucket is the number of the bucket directory that is skipped
        during training.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        with open(stopwordlist) as f:
            for line in f:
                self.stopwords[line.strip()] = 1

        # filter out entries that are not directories
        self.categories = [filename for filename in os.listdir(trainingdir)
                           if os.path.isdir(os.path.join(trainingdir,
                                                         filename))]
        print("Counting ...")
        for category in self.categories:
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)

        # Eliminate any word in the vocabulary that doesn't occur at
        # least 3 times. Collect first, delete afterwards: a dict cannot
        # be resized while it is being iterated.
        toDelete = [word for word, seen in self.vocabulary.items()
                    if seen < 3]
        for word in toDelete:
            del self.vocabulary[word]

        # now compute probabilities
        vocabLength = len(self.vocabulary)
        for category in self.categories:
            denominator = self.totals[category] + vocabLength
            counts = self.prob[category]
            # Add-one smoothed conditional probabilities. A fresh dict is
            # built so pruned words do not linger as raw counts.
            # NOTE(review): a word unseen in this category is given a
            # count of 1 (as in the source) rather than the 0 that textbook
            # Laplace smoothing would use -- confirm this is intended.
            self.prob[category] = {
                word: float(counts.get(word, 1) + 1) / denominator
                for word in self.vocabulary}

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category

        Reads every bucket directory of the category except the one named
        after bucketNumberToIgnore. Returns (counts, total): counts maps
        each word to its occurrences in this category, total is the number
        of counted tokens. Also updates self.vocabulary.
        """
        ignore = "%i" % bucketNumberToIgnore
        currentdir = os.path.join(trainingdir, category)
        counts = {}
        total = 0
        for directory in os.listdir(currentdir):
            currentBucket = os.path.join(currentdir, directory)
            # skip the held-out bucket, and stray non-directory entries
            # that os.listdir would otherwise fail on
            if directory == ignore or not os.path.isdir(currentBucket):
                continue
            for file in os.listdir(currentBucket):
                with codecs.open(os.path.join(currentBucket, file),
                                 'r', 'iso8859-1') as f:
                    for line in f:
                        for token in line.split():
                            # get rid of punctuation and lowercase token
                            token = token.strip('\'".,?:-').lower()
                            if (token != ''
                                    and token not in self.stopwords):
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
        return (counts, total)

    def classify(self, filename):
        """Return the category with the highest log-probability score.

        Every token of the file that is in self.vocabulary adds
        log(P(token | category)) to each category's score; tokens outside
        the vocabulary are ignored.
        """
        results = {category: 0 for category in self.categories}
        with codecs.open(filename, 'r', 'iso8859-1') as f:
            for line in f:
                for token in line.split():
                    token = token.strip('\'".,?:-').lower()
                    if token in self.vocabulary:
                        for category in self.categories:
                            if self.prob[category][token] == 0:
                                # debugging aid: log() is about to fail
                                print("%s %s" % (category, token))
                            results[category] += math.log(
                                self.prob[category][token])
        ranked = sorted(results.items(), key=lambda item: item[1],
                        reverse=True)
        # for debugging I can change this to give me the entire list
        return ranked[0][0]

    # input: category directory of the test files, current category,
    #        bucket number to test on
    # return: classification results for this category, e.g. {0: 12, 1: 23}
    def testCategory(self, direc, category, bucketNumber):
        """Classify every file in one bucket of one category directory.

        Returns a dict mapping each predicted category to the number of
        files that received it. The category argument is not used in the
        computation.
        """
        results = {}
        directory = os.path.join(direc, "%i" % bucketNumber)
        for file in os.listdir(directory):
            result = self.classify(os.path.join(directory, file))
            results.setdefault(result, 0)
            results[result] += 1
        return results

    # input: test directory, bucket number to test on
    # return: results for all categories, e.g. {1: {0: 12, 1: 23}, ...}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category.

        Returns a dict mapping each actual category to the dict produced
        by testCategory for that category.
        """
        results = {}
        # filter out entries that are not directories
        categories = [filename for filename in os.listdir(testdir)
                      if os.path.isdir(os.path.join(testdir, filename))]
        for category in categories:
            results[category] = self.testCategory(
                os.path.join(testdir, category), category, bucketNumber)
        return results

def tenfold(dataPrefix, stoplist):
    """Run 10-fold cross-validation and print a confusion matrix.

    dataPrefix -- directory whose subdirectories are the categories, each
                  split into bucket directories numbered 0-9
    stoplist   -- path of the stop-word file passed on to BayesText

    For each bucket i a classifier is trained with bucket i held out and
    then tested on bucket i; the per-fold counts are summed. Rows of the
    printed table are the actual category, columns the predicted one.
    """
    results = {}
    for i in range(0, 10):
        bT = BayesText(dataPrefix, stoplist, i)
        # test on the same tree that was trained on (the held-out bucket),
        # using the parameter rather than a module-level global
        r = bT.test(dataPrefix, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    # sorted so rows and columns come out in a stable order
    categories = sorted(results)
    print("\n Classified as: ")
    header = " "
    subheader = " +"
    for category in categories:
        header += "% 2s " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s |" % category
        for c2 in categories:
            count = results[category].get(c2, 0)
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    # no accuracy can be reported when nothing was classified
    if total:
        print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

# change these to match your directory structure
prefixPath = "data/review_polarity/"
# prefixPath already ends with a slash, so none is repeated here
theDir = prefixPath + "txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)

