@hainingwyx 2016-11-16T06:49:02.000000Z 字数 50066 阅读 3024

写给程序员的数据挖掘实战--读书笔记

数据挖掘 Python

写给程序员的数据挖掘实战--读书笔记

写在之前

本书涉及的源程序和数据都可以在以下网站中找到：http://guidetodatamining.com/。

这本书理论比较简单，书中错误较少，动手锻炼较多，如果每个代码都自己写出来，收获不少。总结：适合入门。

欢迎转载，转载请注明出处，如有问题欢迎指正。

协同过滤

相似用户评判标准：曼哈顿距离、欧氏距离、明氏距离。

$d(x, y)=(\sum _{k=1}^ {n}|x_k-y_k|)^{\frac{1}{r}}$

# Manhattan.py
users = {
    "Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,
                 "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, 
                 "Vampire Weekend": 2.0},
    "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, 
             "Phoenix": 2.0,"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
    "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, 
           "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
    "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, 
          "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, 
          "Vampire Weekend": 2.0},
    "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, 
             "The Strokes": 4.0, "Vampire Weekend": 1.0},
    "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, 
            "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, 
            "Vampire Weekend": 4.0},
    "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,
           "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
    "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,
                "Slightly Stoopid": 2.5, "The Strokes": 3.0}
        }
def manhattan(rating1, rating2):
    """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries
       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
    distance = 0
    commonRatings = False 
    for key in rating1:
        if key in rating2:
            distance += abs(rating1[key] - rating2[key])
            commonRatings = True
    if commonRatings:
        return distance
    else:
        return -1 #Indicates no ratings in common
def computeNearestNeighbor(username, users):
    """creates a sorted list of users based on their distance to username"""
    distances = []
    for user in users:
        if user != username:
            distance = manhattan(users[user], users[username])
            distances.append((distance, user))
    # sort based on distance -- closest first
    distances.sort()
    return distances
def recommend(username, users):
    """Give list of recommendations"""
    # first find nearest neighbor
    nearest = computeNearestNeighbor(username, users)[0][1]
    print nearest
    recommendations = []
    # now find bands neighbor rated that user didn't
    neighborRatings = users[nearest]
    userRatings = users[username]
    for artist in neighborRatings:
        if not artist in userRatings:
            recommendations.append((artist, neighborRatings[artist]))
    # using the fn sorted for variety - sort is more efficient
    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)

# -*- coding: utf-8 -*-
from math import sqrt
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
        }
#明氏距离
def minkowski(rating1,rating2,r):
    distance=0
    commonRatings=False
    for key in rating1:
        if key in rating2:
            distance += pow(abs(rating1[key]-rating2[key]),r)
            commonRatings=True
    distance = pow(distance,1.0/r)
    if commonRatings:
        return distance
    else:
        return -1 #Indicates no ratings in common
def computeNearestNeighbor(username, users):
    """creates a sorted list of users based on their distance to username"""
    distances = []
    for user in users:
        if user != username:
            distance = minkowski(users[user], users[username],3)
            distances.append((distance, user))
    # sort based on distance -- closest first
    distances.sort()
    return distances
def recommend(username, users):
    """Give list of recommendations"""
    # first find nearest neighbor
    nearest = computeNearestNeighbor(username, users)[0][1]
    print nearest
    recommendations = []
    # now find bands neighbor rated that user didn't
    neighborRatings = users[nearest]
    userRatings = users[username]
    for artist in neighborRatings:
        if not artist in userRatings:
            recommendations.append((artist, neighborRatings[artist]))
    # using the fn sorted for variety - sort is more efficient
    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)

但是可能存在常数差别，但是两者爱好相同的问题。

皮尔逊相关系数：

$r = \frac {\sum_{i=1}^n (x_i-\bar x)(y_i-\bar y)} {\sqrt{\sum_{i=1}^n(x_i-\bar x)^2} \sqrt{\sum_{i=1}^n(y_i-\bar y)^2}}$

# Pearson.py
from math import sqrt
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
        }
# 这里为了简单使用近似代替
def pearson(rating1,rating2):
    sum_xy=0
    sum_x=0
    sum_y=0
    sum_x2=0
    sum_y2=0
    n=0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x*y
            sum_x += x
            sum_y += y
            sum_x2 += x**2
            sum_y2 += y**2
    denominnator = sqrt(sum_x2-(sum_x**2)/n)*sqrt(sum_y2-(sum_y**2)/n)
    if denominnator == 0:
        return 0
    else:
        return (sum_xy-(sum_x*sum_y)/n)/denominnator
def cos_like(rating1,rating2):
    innerProd=0
    vector_x=0
    vectoy_y=0
    for key in rating1:
        if key in rating2:
            x=rating1[key]
            y=rating2[key]
            innerProd += x*y
            vector_x += x**2
            vectoy_y += y**2
        denominnator = sqrt(vector_x) * sqrt(vectoy_y)
    if denominnator == 0:
        return 0
    else:
        return innerProd / denominnator

余弦相似度：

$cos(x, y)=\frac{x·y}{||x||\times ||y||}$
总结：如果数据稠密使用欧氏距离；如果数据稀疏，使用余弦相似度；如果用户评级范围不同，使用皮尔逊相关系数。但是如果仅仅是基于一个用户进行推荐，个别用户的怪癖也会被推荐。

k近邻：利用k个最相似的用户确定推荐结果，K和应有有关。利用皮尔逊系数来确定每个人的影响因子。

# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 
 'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 
 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 
 'You, Me and Dupree': 3.5}, 
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 'The Night Listener': 4.5, 'Superman Returns': 4.0, 
 'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 
 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.0}, 
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
from math import sqrt
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
  # Get the list of shared_items
  si={}
  for item in prefs[person1]: 
    if item in prefs[person2]: si[item]=1
  # if they have no ratings in common, return 0
  if len(si)==0: return 0
  # Add up the squares of all the differences
  sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 
                      for item in prefs[person1] if item in prefs[person2]])
  return 1/(1+sum_of_squares)
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
  # Get the list of mutually rated items
  si={}
  for item in prefs[p1]: 
    if item in prefs[p2]: si[item]=1
  # if they are no ratings in common, return 0
  if len(si)==0: return 0
  # Sum calculations
  n=len(si)
  # Sums of all the preferences
  sum1=sum([prefs[p1][it] for it in si])
  sum2=sum([prefs[p2][it] for it in si])
  # Sums of the squares
  sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
  sum2Sq=sum([pow(prefs[p2][it],2) for it in si])   
  # Sum of the products
  pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
  # Calculate r (Pearson score)
  num=pSum-(sum1*sum2/n)
  den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  if den==0: return 0
  r=num/den
  return r
# Returns the best matches for person from the prefs dictionary. 
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
  scores=[(similarity(prefs,person,other),other) 
                  for other in prefs if other!=person]
  scores.sort()
  scores.reverse()
  return scores[0:n]
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
  totals={}
  simSums={}
  for other in prefs:
    # don't compare me to myself
    if other==person: continue
    sim=similarity(prefs,person,other)
    # ignore scores of zero or lower
    if sim<=0: continue
    for item in prefs[other]:
      # only score movies I haven't seen yet
      if item not in prefs[person] or prefs[person][item]==0:
        # Similarity * Score
        totals.setdefault(item,0)
        totals[item]+=prefs[other][item]*sim
        # Sum of similarities
        simSums.setdefault(item,0)
        simSums[item]+=sim
  # Create the normalized list
  rankings=[(total/simSums[item],item) for item,total in totals.items()]
  # Return the sorted list
  rankings.sort()
  rankings.reverse()
  return rankings
def transformPrefs(prefs):
  result={}
  for person in prefs:
    for item in prefs[person]:
      result.setdefault(item,{})
      # Flip item and person
      result[item][person]=prefs[person][item]
  return result
def calculateSimilarItems(prefs,n=10):
  # Create a dictionary of items showing which other items they
  # are most similar to.
  result={}
  # Invert the preference matrix to be item-centric
  itemPrefs=transformPrefs(prefs)
  c=0
  for item in itemPrefs:
    # Status updates for large datasets
    c+=1
    if c%100==0: print "%d / %d" % (c,len(itemPrefs))
    # Find the most similar items to this one
    scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
    result[item]=scores
  return result
def getRecommendedItems(prefs,itemMatch,user):
  userRatings=prefs[user]
  scores={}
  totalSim={}
  # Loop over items rated by this user
  for (item,rating) in userRatings.items( ):
    # Loop over items similar to this one
    for (similarity,item2) in itemMatch[item]:
      # Ignore if this user has already rated this item
      if item2 in userRatings: continue
      # Weighted sum of rating times similarity
      scores.setdefault(item2,0)
      scores[item2]+=similarity*rating
      # Sum of all the similarities
      totalSim.setdefault(item2,0)
      totalSim[item2]+=similarity
  # Divide each total score by total weighting to get an average
  rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
  # Return the rankings from highest to lowest
  rankings.sort( )
  rankings.reverse( )
  return rankings
def loadMovieLens(path='C:\Users\WangYixin\Desktop\PCI_Code\PCI_Code Folder\chapter2\data'):
  # Get movie titles
  movies={}
  for line in open(path+'/u.item'):
    (id,title)=line.split('|')[0:2]
    movies[id]=title
  # Load data
  prefs={}
  for line in open(path+'/u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    prefs.setdefault(user,{})
    prefs[user][movies[movieid]]=float(rating)
  return prefs

# -*- coding: utf-8 -*-
# 推荐类
import codecs 
from math import sqrt
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5,
                      "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
                 "Deadmau5": 4.0, "Phoenix": 2.0,
                 "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
                  "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0,
                     "Norah Jones": 5.0, "Phoenix": 5.0,
                     "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                     "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                      "The Strokes": 3.0}
        }
class recommender:
    def __init__(self, data, k=1, metric='pearson', n=5):
        """ initialize recommender
        currently, if data is dictionary the recommender is initialized
        to it.
        For all other data types of data, no initialization occurs
        k is the k value for k nearest neighbor
        metric is which distance formula to use
        n is the maximum number of recommendations to make"""
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # for some reason I want to save the name of the metric
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        #
        # if data is dictionary set recommender data to it
        #
        if type(data).__name__ == 'dict':
            self.data = data
    def convertProductID2name(self, id):
        """Given product id number return product name"""
        if id in self.productid2name:
            return self.productid2name[id]
        else:
            return id
    def userRatings(self, id, n):
        """Return n top ratings for user with id"""
        print ("Ratings for " + self.userid2name[id])
        ratings = self.data[id]
        print(len(ratings))
        ratings = list(ratings.items())
        ratings = [(self.convertProductID2name(k), v)
                   for (k, v) in ratings]
        # finally sort and return
        ratings.sort(key=lambda artistTuple: artistTuple[1],
                     reverse = True)
        ratings = ratings[:n]
        for rating in ratings:
            print("%s\t%i" % (rating[0], rating[1]))
    def loadBookDB(self, path=''):
        """loads the BX book dataset. Path is where the BX files are
        located"""
        self.data = {}
        i = 0
        #
        # First load book ratings into self.data
        #
        f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')
        f.readline()                            #read the title
        for line  in f:
            i += 1
            #separate line into fields
            fields = line.split(';')            # still with ""
            user = fields[0].strip('"')         #delete “ in the fields
            book = fields[1].strip('"')
            rating = int(fields[2].strip().strip('"'))
            if user in self.data:
                currentRatings = self.data[user]
            else:
                currentRatings = {}
            currentRatings[book] = rating
            self.data[user] = currentRatings
            #line = f.readline()
        f.close()
        #
        # Now load books into self.productid2name
        # Books contains isbn, title, and author among other fields
        #
        f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')
        for line in f:
            i += 1
            #separate line into fields
            fields = line.split(';')
            isbn = fields[0].strip('"')
            title = fields[1].strip('"')
            author = fields[2].strip().strip('"')
            title = title + ' by ' + author
            self.productid2name[isbn] = title
        f.close()
        #
        #  Now load user info into both self.userid2name and
        #  self.username2id
        #
        f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')
        for line in f:
            i += 1
            #print(line)
            #separate line into fields
            fields = line.split(';')
            userid = fields[0].strip('"')
            location = fields[1].strip('"')
            if len(fields) > 3:
                age = fields[2].strip().strip('"')
            else:
                age = 'NULL'
            if age != 'NULL':
                value = location + '  (age: ' + age + ')'
            else:
                value = location
            self.userid2name[userid] = value
            self.username2id[location] = userid
        f.close()
        print(i)
    def pearson(self, rating1, rating2):
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0
        # now compute denominator
        denominator = (sqrt(sum_x2 - pow(sum_x, 2) / n)
                       * sqrt(sum_y2 - pow(sum_y, 2) / n))
        if denominator == 0:
            return 0
        else:
            return (sum_xy - (sum_x * sum_y) / n) / denominator
    def computeNearestNeighbor(self, username):
        """creates a sorted list of users based on their distance to
        username"""
        distances = []
        for instance in self.data:
            if instance != username:
                distance = self.fn(self.data[username],
                                   self.data[instance])
                distances.append((instance, distance))
        # sort based on distance -- closest first
        distances.sort(key=lambda artistTuple: artistTuple[1],
                       reverse=True)
        return distances
    def recommend(self, user):
       """Give list of recommendations"""
       recommendations = {}
       # first get list of users  ordered by nearness
       nearest = self.computeNearestNeighbor(user)
       # now get the ratings for the user
       userRatings = self.data[user]
       # determine the total distance
       totalDistance = 0.0
       for i in range(self.k):
          totalDistance += nearest[i][1]
       # now iterate through the k nearest neighbors
       # accumulating their ratings
       for i in range(self.k):
          # compute slice of pie 
          weight = nearest[i][1] / totalDistance
          # get the name of the person
          name = nearest[i][0]
          # get the ratings for this person
          neighborRatings = self.data[name]
          # get the name of the person
          # now find bands neighbor rated that user didn't
          for artist in neighborRatings:
             if not artist in userRatings:
                if artist not in recommendations:
                   recommendations[artist] = (neighborRatings[artist]
                                              * weight)
                else:
                   recommendations[artist] = (recommendations[artist]
                                              + neighborRatings[artist]
                                              * weight)
       # now make list from dictionary
       recommendations = list(recommendations.items())
       recommendations = [(self.convertProductID2name(k), v)
                          for (k, v) in recommendations]
       # finally sort and return
       recommendations.sort(key=lambda artistTuple: artistTuple[1],
                            reverse = True)
       # Return the first n items
       return recommendations[:self.n]
############test code############
#r = recommender(users)
#r.recommend('Jordyn')
#r.recommend('Hailey')
#r.loadBookDB('BX-CSV-Dump/')
#r.recommend('171118')
#r.userRatings('171118', 5)

协同过滤--基于物品的过滤

显示评级：显示给出评级结果，如Youtube的点赞、点差按钮

隐式评级：网站点击轨迹。

基于邻居(用户)的推荐系统计算的次数十分巨大，所以有延迟性。还有稀疏性的问题。也称为基于内存的协同过滤，因为需要保存所有的评级结果来进行推荐。

基于物品的过滤：事先找到最相似的物品，并结合物品的评级结果生成推荐。也称为基于模型的协同过滤，因为不需要保存所有的评级结果，取而代之的随时构建一个模型表示物品之间的相似度。

为了抵消分数夸大，调整余弦相似度

$s(i, j)=\frac {\sum_{u\in U}(R_{u.i}-\bar R_u)(R_{u.j}-\bar R_u)} {\sqrt{\sum_{u\in U}(R_{u.i}-\bar R_u)^2} \sqrt{\sum_{u\in U}(R_{u.j}-\bar R_u)^2}}$
U表示所有同事对i和j进行过评级的用户组合，

$R_{u,i}$ 表示用户u对物品i的评分，

$\bar R_u$ 表示用户u对所有物品评分的平均值。可以获得相似度矩阵。

users3 = {"David": {"Imagine Dragons": 3, "Daft Punk": 5,
                    "Lorde": 4, "Fall Out Boy": 1},
          "Matt":  {"Imagine Dragons": 3, "Daft Punk": 4,
                    "Lorde": 4, "Fall Out Boy": 1},
          "Ben":   {"Kacey Musgraves": 4, "Imagine Dragons": 3,
                    "Lorde": 3, "Fall Out Boy": 1},
          "Chris": {"Kacey Musgraves": 4, "Imagine Dragons": 4,
                    "Daft Punk": 4, "Lorde": 3, "Fall Out Boy": 1},
          "Tori":  {"Kacey Musgraves": 5, "Imagine Dragons": 4,
                    "Daft Punk": 5, "Fall Out Boy": 3}}
def computeSimilarity(band1, band2, userRatings):
   averages = {}
   for (key, ratings) in userRatings.items():
      averages[key] = (float(sum(ratings.values()))
                      / len(ratings.values()))
   num = 0  # numerator
   dem1 = 0 # first half of denominator
   dem2 = 0
   for (user, ratings) in userRatings.items():
      if band1 in ratings and band2 in ratings:
         avg = averages[user]
         num += (ratings[band1] - avg) * (ratings[band2] - avg)
         dem1 += (ratings[band1] - avg)**2
         dem2 += (ratings[band2] - avg)**2
   return num / (sqrt(dem1) * sqrt(dem2))

相似矩阵预测：

$p(u, i)= \frac {\sum_{N \in similarTo(i)}S_{i, N}\times R_{u, N}} {\sum_{N \in similarTo(i)}(|S_{i, N}|) }$

$p(u,i)$ 表示用户u对物品i的预测值

N表示用户u的所有评级物品中每个和i得分相似的物品。

$S_{i, N}$ 是i和N之间的相识度

$R_{i,N}$ 是u给N的评级结果，应该在 $[-1, 1]$ 之间取值，可能需要做线性变换

$NR_{u,N}=\frac {2(R_{u, N}-Min_R)-(Max_R-Min_R)} {Max_R- Min_R}$
得到新的评级结果为

$NR_{u,N}$

Slope One算法

计算偏差

物品i到物品j的平均偏差为

$dev_{i,j}=\sum _{u \in S_{i, j}}{ \frac{u_i-u_j}{card(S_{i, j}(X))} }$

$card(S)$ 是S集合中的元素的个数。X是整个评分集合。

$card(S_{i,j}(X))$ 是所有对i和j进行评分的用户集合。

def computeDeviations(self):
   # for each person in the data:
   #    get their ratings
   for ratings in self.data.values():        # data:users2, ratings:{song:value, , }
       # for each item & rating in that set of ratings:
       for (item, rating) in ratings.items():
           self.frequencies.setdefault(item, {})   #key is song
           self.deviations.setdefault(item, {})                    
           # for each item2 & rating2 in that set of ratings:
           for (item2, rating2) in ratings.items():
               if item != item2:
                   # add the difference between the ratings to our
                   # computation
                   self.frequencies[item].setdefault(item2, 0)
                   self.deviations[item].setdefault(item2, 0.0)
                   # frequemcies is card
                   self.frequencies[item][item2] += 1    
                   # diviations is the sum of dev of diff users
                   #value of complex dic is dev
                   self.deviations[item][item2] += rating - rating2     
                   for (item, ratings) in self.deviations.items():
                       for item2 in ratings:
                           ratings[item2] /= self.frequencies[item][item2]
# test code for ComputeDeviations(self)
#r = recommender(users2)
#r.computeDeviations()
#r.deviations

加权Slope预测

$P^{wSl} (u)_j = \frac{\sum_{i \in S(u)-j} (dev_{j,i} +u_i)c_{j,i}} {\sum_{i \in S(u)-j} c_{j,i}}\\ c_{j, i}=card(S_{j,i}(\chi))$

$P^{wSl} (u)_j$ 表示加权Slope算法给出的用户u对物品j的预测

def slopeOneRecommendations(self, userRatings):
   recommendations = {}
   frequencies = {}
   # for every item and rating in the user's recommendations
   for (userItem, userRating) in userRatings.items():        # userItem :i
       # for every item in our dataset that the user didn't rate
       for (diffItem, diffRatings) in self.deviations.items():    #diffItem : j
           if diffItem not in userRatings and \
           userItem in self.deviations[diffItem]:
               freq = self.frequencies[diffItem][userItem] #freq:c_ji
               # 如果键不存在于字典中，将会添加键并将值设为默认值。
               recommendations.setdefault(diffItem, 0.0)
               frequencies.setdefault(diffItem, 0)
               # add to the running sum representing the numerator
               # of the formula
               recommendations[diffItem] += (diffRatings[userItem] +
                                             userRating) * freq
               # keep a running sum of the frequency of diffitem
               frequencies[diffItem] += freq
               #p(u)j list
               recommendations =  [(self.convertProductID2name(k),          
                                    v / frequencies[k])
                                   for (k, v) in recommendations.items()]
               # finally sort and return
               recommendations.sort(key=lambda artistTuple: artistTuple[1],
                                    reverse = True)
               # I am only going to return the first 50 recommendations
               return recommendations[:50]
# test code for SlopeOneRecommendations
#r = recommender(users2)
#r.computeDeviations()
#g = users2['Ben']
#r.slopeOneRecommendations(g)

def loadMovieLens(self, path=''):
     self.data = {}
     #
     # first load movie ratings
     #
     i = 0
     #
     # First load book ratings into self.data
     #
     #f = codecs.open(path + "u.data", 'r', 'utf8')
     f = codecs.open(path + "u.data", 'r', 'ascii')
     #  f = open(path + "u.data")
     for line in f:
        i += 1
        #separate line into fields
        fields = line.split('\t')
        user = fields[0]
        movie = fields[1]
        rating = int(fields[2].strip().strip('"'))
        if user in self.data:
           currentRatings = self.data[user]
        else:
           currentRatings = {}
        currentRatings[movie] = rating
        self.data[user] = currentRatings
     f.close()
     #
     # Now load movie into self.productid2name
     # the file u.item contains movie id, title, release date among
     # other fields
     #
     #f = codecs.open(path + "u.item", 'r', 'utf8')
     f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore')
     #f = open(path + "u.item")
     for line in f:
        i += 1
        #separate line into fields
        fields = line.split('|')
        mid = fields[0].strip()
        title = fields[1].strip()
        self.productid2name[mid] = title
     f.close()
     #
     #  Now load user info into both self.userid2name
     #  and self.username2id
     #
     #f = codecs.open(path + "u.user", 'r', 'utf8')
     f = open(path + "u.user")
     for line in f:
        i += 1
        fields = line.split('|')
        userid = fields[0].strip('"')
        self.userid2name[userid] = line
        self.username2id[line] = userid
     f.close()
     print(i)
# test code
#r = recommender(0)
#r.loadMovieLens('ml-100k/')
#r.computeDeviations()
#r.slopeOneRecommendations(r.data['1'])
#r.slopeOneRecommendations(r.data['25'])

内容过滤及分类--基于物品属性的过滤

新物品加入，会因为没有被评分过，永远不会被推荐。Pandora是基于基于一种称为音乐基因的项目。

当所用数据挖掘方法基于特征的值来计算两个对象的距离，且不同特征的尺度不同，就需要使用归一化。一般使用均值和标准差来进行归一化，但这种方法可能会受到离群点的影响，所以引入改进后的归一化：均值用中位数( $\mu$ )代替，标准差用绝对标准差(asd)代替。

$asd = \frac{1}{card(x)} \sum_i |x_i-\mu|$

# 计算中位数和绝对标准差
def getMedian(self, alist):
     """return median of alist"""
    if alist == []:
        return []
    blist = sorted(alist)
    length = len(alist)
    if length % 2 == 1:
        # length of list is odd so return middle element
        return blist[int(((length + 1) / 2) -  1)]
    else:
        # length of list is even so compute midpoint
        v1 = blist[int(length / 2)]
        v2 =blist[(int(length / 2) - 1)]
        return (v1 + v2) / 2.0
def getAbsoluteStandardDeviation(self, alist, median):
    """given alist and median return absolute standard deviation"""
    sum = 0
    for item in alist:
        sum += abs(item - median)
    return sum / len(alist)
def unitTest():
    list1 = [54, 72, 78, 49, 65, 63, 75, 67, 54]
    list2 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 68]
    list3 = [69]
    list4 = [69, 72]
    classifier = Classifier('data/athletesTrainingSet.txt')
    m1 = classifier.getMedian(list1)
    m2 = classifier.getMedian(list2)
    m3 = classifier.getMedian(list3)
    m4 = classifier.getMedian(list4)
    asd1 = classifier.getAbsoluteStandardDeviation(list1, m1)
    asd2 = classifier.getAbsoluteStandardDeviation(list2, m2)
    asd3 = classifier.getAbsoluteStandardDeviation(list3, m3)
    asd4 = classifier.getAbsoluteStandardDeviation(list4, m4)
    assert(round(m1, 3) == 65)
    assert(round(m2, 3) == 66)
    assert(round(m3, 3) == 69)
    assert(round(m4, 3) == 70.5)
    assert(round(asd1, 3) == 8)
    assert(round(asd2, 3) == 7.5)
    assert(round(asd3, 3) == 0)
    assert(round(asd4, 3) == 1.5)
    print("getMedian and getAbsoluteStandardDeviation work correctly")

assert语句用于软件组件测试的做法是一种常用的技术。产品每一部分分成一段实现代码加上对实现代码的测试代码，这一点十分重要。

# 归一化
def normalizeColumn(self, columnNumber):
       """given a column number, normalize that column in self.data"""
       # first extract values to list, v is vector, clounm is 0/1,col is a list
       col = [v[1][columnNumber] for v in self.data]
       median = self.getMedian(col)
       asd = self.getAbsoluteStandardDeviation(col, median)
       #print("Median: %f   ASD = %f" % (median, asd))
       self.medianAndDeviation.append((median, asd))
       for v in self.data:
           v[1][columnNumber] = (v[1][columnNumber] - median) / asd

算法评估及kNN

10-flod Cross Validation：将数据集分为10份，使用其中9份进行训练，另外1份用作测试，重复该过程10次。

留一法：n-flod Cross Validation。结果是随机的，不是确定值，和数据的划分有关。缺点在于计算机开销很大。分层采样的时候保证样本的均匀性很重要。

混淆矩阵：行表示测试样本的真实类别，列表示预测器所预测出来的类别。可揭示分类器性能。

# divide data into 10 buckets
import random
def buckets(filename, bucketName, separator, classColumn):
    """the original data is in the file named filename
    bucketName is the prefix for all the bucket names
    separator is the character that divides the columns
    (for ex., a tab or comma and classColumn is the column
    that indicates the class"""
    # put the data in 10 buckets
    numberOfBuckets = 10
    data = {}
    # first read in the data and divide by category
    with open(filename) as f:
        lines = f.readlines()
    for line in lines:
        if separator != '\t':
            line = line.replace(separator, '\t')
        # first get the category
        category = line.split()[classColumn]
        data.setdefault(category, [])   #set the value for dic data
        data[category].append(line)     #all the information 
    # initialize the buckets [[], [], ...]
    buckets = []
    for i in range(numberOfBuckets):
        buckets.append([])       
    # now for each category put the data into the buckets
    for k in data.keys():
        #randomize order of instances for each class
        #data[k] is a list of line
        random.shuffle(data[k])
        bNum = 0
        # divide into buckets
        for item in data[k]:
            buckets[bNum].append(item)
            bNum = (bNum + 1) % numberOfBuckets
    # write to file
    for bNum in range(numberOfBuckets):
        f = open("%s-%02i" % ('tmp/'+bucketName, bNum + 1), 'w')
        for item in buckets[bNum]:
            f.write(item)
        f.close()
# example of how to use this code          
buckets("data/mpgData.txt", 'mpgData',',',0)

分类器评价：Kappa统计量。相对于随机分类器而言的分类器效果。

$\kappa =\frac{P(c)-P(r)}{1-P(r)}$

$P(c)$ 是实际分类器的准确率，

$P(r)$ 是随机分类器的精确率。

Kappa区间	性能
<0	比随机方法性能差
0.01-0.2	轻微一致
0.21-0.4	一般一致
0.41-0.6	中度一致
0.61-0.8	高度一致
0.81-1	接近完美

KNN：当有一个样本是比较特别的时候，使用最近邻可能会导致特别样本的存在而出现误分类。改进的办法就是考察k个邻居。离得越近，影响因子就越大。影响因子可以用距离的倒数来表示。

def knn(self, itemVector):
  """returns the predicted class of itemVector using k
  Nearest Neighbors"""
  # changed from min to heapq.nsmallest to get the
  # k closest neighbors
  neighbors = heapq.nsmallest(self.k,
  [(self.manhattan(itemVector, item[1]), item)
  for item in self.data])
  # each neighbor gets a vote
  results = {}
  for neighbor in neighbors: 
  theClass = neighbor[1][0]
  results.setdefault(theClass, 0)
  results[theClass] += 1
  resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True)
  #get all the classes that have the maximum votes
  maxVotes = resultList[0][0]
  possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes]
  # randomly select one of the classes that received the max votes
  answer = random.choice(possibleAnswers)
  return( answer)

做工程，数据量大的时候算法的效果越好。做论文还是要研究出一个具有少量性能提高的算法。

概率及朴素贝叶斯

特点：分类并给出概率。

先验概率：P(h)

后验概率/条件概率：P(h/d)

$h_{MAP}=argmax_{h\in H}P(D|h)P(h)$

# 训练
class Classifier:
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """ a classifier will be built from files with the bucketPrefix
        excluding the file with textBucketNumber. dataFormat is a string that
        describes how to interpret each line of the data files. For example,
        for the iHealth data the format is:
        "attr   attr    attr    attr    class"
        """
        total = 0
        classes = {}
        counts = {}
        # reading the data in from the file
        self.format = dataFormat.strip().split('\t')
        self.prior = {}
        self.conditional = {}
        # for each of the buckets numbered 1 through 10:
        for i in range(1, 11):
            # if it is not the bucket we should ignore, read in the data
            if i != testBucketNumber:
                filename = "%s-%02i" % (bucketPrefix, i)
                f = open(filename)
                lines = f.readlines()
                f.close()
                for line in lines:
                    fields = line.strip().split('\t')
                    ignore = []
                    vector = []
                    for i in range(len(fields)):
                        if self.format[i] == 'num':
                            vector.append(float(fields[i]))     #vector!!
                        elif self.format[i] == 'attr':
                            vector.append(fields[i])                           
                        elif self.format[i] == 'comment':
                            ignore.append(fields[i])
                        elif self.format[i] == 'class':
                            category = fields[i]
                    # now process this instance
                    total += 1
                    classes.setdefault(category, 0)     #字典：分类类别计数
                    counts.setdefault(category, {})     #复合字典：每类的每列的具体计数
                    classes[category] += 1
                    # now process each attribute of the instance
                    col = 0
                    for columnValue in vector:
                        col += 1
                        counts[category].setdefault(col, {})
                        counts[category][col].setdefault(columnValue, 0)
                        counts[category][col][columnValue] += 1
        # ok done counting. now compute probabilities
        # first prior probabilities p(h)
        for (category, count) in classes.items():
            self.prior[category] = count / total#字典：先验概率
        # now compute conditional probabilities p(D|h)
        for (category, columns) in counts.items():
              self.conditional.setdefault(category, {})
              for (col, valueCounts) in columns.items():
                  self.conditional[category].setdefault(col, {})
                  for (attrValue, count) in valueCounts.items():
                      self.conditional[category][col][attrValue] = (
                          count / classes[category])        #复合字典：每类的每个属性的条件概率
        self.tmp =  counts               #应该暂时没有用

# 分类
    def classify(self, itemVector):
        """Return class we think item Vector is in"""
        results = []
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if not attrValue in self.conditional[category][col]:
                    # we did not find any instances of this attribute value
                    # occurring with this category so prob = 0
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                col += 1
            results.append((prob, category))
        # return the category with the highest probability
        return(max(results)[1])

# test code
c = Classifier("iHealth/i", 10,"attr\tattr\tattr\tattr\tclass")
print(c.classify(['health', 'moderate', 'moderate', 'yes']))

问题：当存在某个概率为0时，直接主导整个贝叶斯的计算过程，即使其他的独立事件的条件概率接近于1。此外，基于样本集估计出来概率往往是真实概率的偏低估计。

改进：将

$P(x|y)=\frac{n_c}{n}$
修改为

$P(x|y)=\frac{n_c+mp}{n+m}$
其中

$n$ 是y事件总数，

$n_c$ 是y中x事件总数，

$m$ 是等效样本容量，通常的确定方法是：m为可选属性的个数值，p是可选属性的概率的先验估计，通常假设均匀分布有

$1/m$

当处理的数据是连续的时候，有两种解决办法。一是离散化，构建类别；一是假设概率分布服从高斯分布，然后计算概率。

样本标准差：

$sd= \sqrt{\frac{\sum_i(x_i-\bar x)^2}{card(x)-1}}$
对于样本集而言，样本标准差相对于总体标准差计算公式是总体标准差的更优估计。

# pdf计算实现
def pdf(mean, ssd, x):
   """Probability Density Function  computing P(x|y)
   input is the mean, sample standard deviation for all the items in y,
   and x."""
   ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2))
   print (ePart)
   return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart
````
<div class="md-section-divider"></div>
```python
# 连续数据的训练
class Classifier:
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """ a classifier will be built from files with the bucketPrefix
        excluding the file with textBucketNumber. dataFormat is a string that
        describes how to interpret each line of the data files. For example,
        for the iHealth data the format is:
        "attr   attr    attr    attr    class"
        """
        total = 0
        classes = {}
        # counts used for attributes that are not numeric
        counts = {}
        # totals used for attributes that are numereric
        # we will use these to compute the mean and sample standard deviation for
        # each attribute - class pair.
        totals = {}
        numericValues = {}
        # reading the data in from the file
        self.format = dataFormat.strip().split('\t')
        # 
        self.prior = {}
        self.conditional = {}
        # for each of the buckets numbered 1 through 10:
        for i in range(1, 11):
            # if it is not the bucket we should ignore, read in the data
            if i != testBucketNumber:
                filename = "%s-%02i" % (bucketPrefix, i)
                f = open(filename)
                lines = f.readlines()
                f.close()
                for line in lines:
                    fields = line.strip().split('\t')
                    ignore = []
                    vector = []
                    nums = []
                    for i in range(len(fields)):
                        if self.format[i] == 'num':
                            nums.append(float(fields[i]))
                        elif self.format[i] == 'attr':
                            vector.append(fields[i])                           
                        elif self.format[i] == 'comment':
                            ignore.append(fields[i])
                        elif self.format[i] == 'class':
                            category = fields[i]
                    # now process this instance
                    total += 1
                    classes.setdefault(category, 0)
                    counts.setdefault(category, {})
                    totals.setdefault(category, {})
                    numericValues.setdefault(category, {})
                    classes[category] += 1
                    # now process each non-numeric attribute of the instance
                    col = 0
                    for columnValue in vector:
                        col += 1
                        counts[category].setdefault(col, {})
                        counts[category][col].setdefault(columnValue, 0)
                        counts[category][col][columnValue] += 1
                    # process numeric attributes
                    col = 0
                    for columnValue in nums:
                        col += 1
                        totals[category].setdefault(col, 0)
                        #totals[category][col].setdefault(columnValue, 0)
                        totals[category][col] += columnValue
                        numericValues[category].setdefault(col, [])
                        numericValues[category][col].append(columnValue)
        #
        # ok done counting. now compute probabilities
        #
        # first prior probabilities p(h)
        #
        for (category, count) in classes.items():
            self.prior[category] = count / total
        #
        # now compute conditional probabilities p(h|D)
        #
        for (category, columns) in counts.items():
              self.conditional.setdefault(category, {})
              for (col, valueCounts) in columns.items():
                  self.conditional[category].setdefault(col, {})
                  for (attrValue, count) in valueCounts.items():
                      self.conditional[category][col][attrValue] = (
                          count / classes[category])
        self.tmp =  counts               
        #
        # now compute mean and sample standard deviation
        #
        self.means = {}
        self.totals = totals
        for (category, columns) in totals.items():
            self.means.setdefault(category, {})
            for (col, cTotal) in columns.items():
                self.means[category][col] = cTotal / classes[category]
        # standard deviation
        self.ssd = {}
        for (category, columns) in numericValues.items():
            self.ssd.setdefault(category, {})
            for (col, values) in columns.items():
                SumOfSquareDifferences = 0
                theMean = self.means[category][col]
                for value in values:
                    SumOfSquareDifferences += (value - theMean)**2
                columns[col] = 0
                self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category]  - 1))

# 连续数据的分类
    def classify(self, itemVector, numVector):
        """Return class we think item Vector is in"""
        results = []
        sqrt2pi = math.sqrt(2 * math.pi)
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if not attrValue in self.conditional[category][col]:
                    # we did not find any instances of this attribute value
                    # occurring with this category so prob = 0
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                col += 1
            col = 1
            for x in  numVector:
                mean = self.means[category][col]
                ssd = self.ssd[category][col]
                ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))
                prob = prob * ((1.0 / (sqrt2pi*ssd)) * ePart)
                col += 1
            results.append((prob, category))
        # return the category with the highest probability
        #print(results)
        return(max(results)[1])

贝叶斯和kNN的比较

贝叶斯优点：实现简单，和其他方法相比需要的训练数据更少
贝叶斯缺点：不能学习到特征之间的相互作用。
kNN优点：实现简单，不用考虑数据特定的结构，需要大量的内存来存储训练集
kNN缺点：训练集很大的时候是一个合理的选择。

许多真实数据挖掘问题中，很多属性不是独立的。有时候可以假设独立。之所以称朴素贝叶斯是因为尽管知道不成立仍然假设属性之间是独立的。

朴素贝叶斯和文本

训练阶段：

将标识为同一假设的文档合并成一个文本文件
计算词在该文件中的出现次数n,形成一个词汇表
对于词汇表中的每个词 $w_k$ 计算器在文本中的出现次数，记为 $n_k$
对词汇表中的每个词(去除停用词) $w_k$ ，计算

$P(w_k|h_i)=\frac{n_k+1}{n+|Vocabulary|}$

class BayesText:
    def __init__(self, trainingdir, stopwordlist):
        """This class implements a naive Bayes approach to text
        classification
        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.
        The stopwordlist is a list of words (one per line) will be
        removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}         #停用词字典
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        #filter out files that are not directories
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            print('    ' + category)
            # 计算当前类别的单词和单词数量，单词的总量
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # I am going to eliminate any word in the 所有种类的单词库vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion
                # can't delete now because you can't delete
                # from a list you are currently iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                # 条件概率计算
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        print ("DONE TRAINING\n\n")
    # input：trainingdir训练文件的目录, category训练文件的种类
    # return: (counts, total) (当前文件的单词和单词数量,所有单词的数量)
    def train(self, trainingdir, category):
        """counts word occurrences for a particular category"""
        currentdir = trainingdir + category
        files = os.listdir(currentdir)
        counts = {}
        total = 0
        for file in files:
            #print(currentdir + '/' + file)
            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
            for line in f:
                tokens = line.split()
                for token in tokens:
                    # get rid of punctuation and lowercase token
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if token != '' and not token in self.stopwords:
                        self.vocabulary.setdefault(token, 0)
                        self.vocabulary[token] += 1#所有文档的单词和单词数量
                        counts.setdefault(token, 0)
                        counts[token] += 1#当前文件的单词和单词数量
                        total += 1#所有单词的数量
            f.close()
        return(counts, total)
# test code
bT = BayesText(trainingDir, stoplistfile)
bT.prob['rec.motorcycles']["god"]

分类阶段：

$h_{MAP}=argmax_{h\in H}P(D|h)P(h)$
如果概率非常小，Python无法计算，可以采用取对数的形式。

停用词：当停用词是噪声时，去掉这些词能减少处理量，提高性能。个别情况下要重新考虑停用词。如性犯罪者会比一般人更多使用me、you这类词语。

    def classify(self, itemVector, numVector):
        """Return class we think item Vector is in"""
        results = []
        sqrt2pi = math.sqrt(2 * math.pi)
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if not attrValue in self.conditional[category][col]:
                    # we did not find any instances of this attribute value
                    # occurring with this category so prob = 0
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                col += 1
            col = 1
            for x in  numVector:
                mean = self.means[category][col]
                ssd = self.ssd[category][col]
                ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))
                prob = prob * ((1.0 / (sqrt2pi*ssd)) * ePart)
                col += 1
            results.append((prob, category))
        # return the category with the highest probability
        #print(results)
        return(max(results)[1])
# test code
bT.classify(testDir+ 'rec.motorcycles/104673')

10-fold cross

from __future__ import print_function
import os, codecs, math
class BayesText:
    # input:训练文件目录，停用词，忽略的文件子集
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification
        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.
        The stopwordlist is a list of words (one per line) will be
        removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        #filter out files that are not directories，in this program, neg and pos
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            #print('    ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # I am going to eliminate any word in the vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion
                # can't delete now because you can't delete
                # from a list you are currently iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        #print("Computing probabilities:")
        for category in self.categories:
            #print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        #print ("DONE TRAINING\n\n")
    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category"""
        ignore = "%i" % bucketNumberToIgnore
        currentdir = trainingdir + category
        directories = os.listdir(currentdir)
        counts = {}
        total = 0
        for directory in directories:
            if directory != ignore:
                currentBucket = trainingdir + category + "/" + directory
                files = os.listdir(currentBucket)
                #print("   " + currentBucket)
                for file in files:
                    f = codecs.open(currentBucket + '/' + file, 'r', 'iso8859-1')
                    for line in f:
                        tokens = line.split()
                        for token in tokens:
                            # get rid of punctuation and lowercase token
                            token = token.strip('\'".,?:-')
                            token = token.lower()
                            if token != '' and not token in self.stopwords:
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
                    f.close()
        return(counts, total)
    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                #print(token)
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda tuple: tuple[1], reverse = True)
        # for debugging I can change this to give me the entire list
        return results[0][0]
    # input: 测试文件的分类目录，当前类别， 忽略子集
    # return: 当前类别下的分类结果{0:12,1：23}
    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
        #print("Testing " + directory)
        files = os.listdir(directory)
        total = 0
        #correct = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
            #if result == category:
            #               correct += 1
        return results
    # input: 测试文件目录， 忽略的子集文件
    # return: 所有类别的分类结果{1:{0:12,1：23},}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category"""
        results = {}
        categories = os.listdir(testdir)
        #filter out files that are not directories
        categories = [filename for filename in categories if
                      os.path.isdir(testdir + filename)]
        for category in categories:
            #print(".", end="")
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results
def tenfold(dataPrefix, stoplist):
    results = {}
    for i in range(0,10):
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(theDir, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
                categories = list(results.keys())
    categories.sort()
    print(   "\n       Classified as: ")
    header =    "          "
    subheader = "        +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print (header)
    print (subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s    |" % category 
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" %((correct * 100) / total))
    print("total of %i instances" % total)
# change these to match your directory structure
prefixPath = "data/review_polarity/"
theDir = prefixPath + "/txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)

聚类--群组发现

层次聚类：每次迭代将最相似的两个簇合并，不断重复直至只有一个簇。两个簇距离计算方法不一样，聚类的方法就不一样，合并的过程也不一样。

单连接聚类：两个簇的距离定义为一个簇的所有成员到另一个簇的所有成员最短的距离
全连接聚类：两个簇的距离定义为一个簇的所有成员到另一个簇的所有成员最长的距离
平均连接聚类：两个簇的距离定义为一个簇的所有成员到另一个簇的所有成员的平均距离

常规队列：先进先出

优先级队列：去除次序基于队列元素的关联权重，数值越小先去除。

# 从队列中获得数据的最小值
from Queue import PriorityQueue
singersQueue = PriorityQueue()
singersQueue.put((16, 'Suzaka'))
singersQueue.put((14, 'Yui'))
singersQueue.put((15, 'Moa'))
print singersQueue.get()
print singersQueue.get()
print singersQueue.get()

class hClusterer:
    """ this clusterer assumes that the first column of the data is a label
    not used in the clustering. The other columns contain numeric data"""
    def __init__(self, filename):
        file = open(filename)
        self.data = {}
        self.counter = 0
        self.queue = PriorityQueue()
        lines = file.readlines()
        file.close()
        header = lines[0].split(',')
        self.cols = len(header)
        self.data = [[] for i in range(len(header))]
        for line in lines[1:]:
            cells = line.split(',')
            toggle = 0
            # self.data [['Border Collie',...], [20.0,...], [45.0,...]]
            for cell in range(self.cols):
                if toggle == 0:
                   self.data[cell].append(cells[cell])
                   toggle = 1
                else:
                    self.data[cell].append(float(cells[cell]))
        # now normalize number columns (that is, skip the first column)
        for i in range(1, self.cols):
                self.data[i] = normalizeColumn(self.data[i])
        ###
        ###  I have read in the data and normalized the 
        ###  columns. Now for each element i in the data, I am going to
        ###     1. compute the Euclidean Distance from element i to all the 
        ###        other elements.  This data will be placed in neighbors,
        ###        which is a Python dictionary. Let's say i = 1, and I am 
        ###        computing the distance to the neighbor j and let's say j 
        ###        is 2. The neighbors dictionary for i will look like
        ###        {2: ((1,2), 1.23),  3: ((1, 3), 2.3)... }
        ###
        ###     2. find the closest neighbor
        ###
        ###     3. place the element on a priority queue, called simply queue,
        ###        based on the distance to the nearest neighbor (and a counter
        ###        used to break ties.
        # now push distances on queue        
        rows = len(self.data[0])              
        for i in range(rows):
            minDistance = 99999
            nearestNeighbor = 0
            neighbors = {}  #存储所有邻居的对和距离
            for j in range(rows):
                if i != j:
                    dist = self.distance(i, j)
                    if i < j:
                        pair = (i,j)
                    else:
                        pair = (j,i)
                    neighbors[j] = (pair, dist)
                    if dist < minDistance:
                        minDistance = dist
                        nearestNeighbor = j
                        # nearestNum = j
            # create nearest Pair
            if i < nearestNeighbor:
                nearestPair = (i, nearestNeighbor)
            else:
                nearestPair = (nearestNeighbor, i)
            # put instance on priority queue    
            self.queue.put((minDistance, self.counter,
                            [[self.data[0][i]], nearestPair, neighbors]))
            self.counter += 1   #存储对象的数量，先后顺序
    def distance(self, i, j):
        sumSquares = 0
        for k in range(1, self.cols):
            sumSquares += (self.data[k][i] - self.data[k][j])**2
        return math.sqrt(sumSquares)
    def cluster(self):
         done = False
         while not done:
             # 依据 minDistance 取出当前最小距离的对象,应该是两对距离。
             topOne = self.queue.get()
             nearestPair = topOne[2][1]
             if not self.queue.empty():
                 nextOne = self.queue.get()
                 nearPair = nextOne[2][1]       #当等距离时，不一定一样的
                 tmp = []
                 ##
                 ##  I have just popped two elements off the queue,
                 ##  topOne and nextOne. I need to check whether nextOne
                 ##  is topOne's nearest neighbor and vice versa.
                 ##  If not, I will pop another element off the queue
                 ##  until I find topOne's nearest neighbor. That is what
                 ##  this while loop does.
                 ## 
                 # 处理等距离的情况
                 while nearPair != nearestPair:
                     tmp.append((nextOne[0], self.counter, nextOne[2]))
                     self.counter += 1
                     nextOne = self.queue.get()
                     nearPair = nextOne[2][1]
                 ##
                 ## this for loop pushes the elements I popped off in the
                 ## above while loop.
                 ## 放回等距离不配对的对象               
                 for item in tmp:
                     self.queue.put(item)
                 if len(topOne[2][0]) == 1:
                    item1 = topOne[2][0][0]
                 else:
                     item1 = topOne[2][0]
                 if len(nextOne[2][0]) == 1:
                    item2 = nextOne[2][0][0]
                 else:
                     item2 = nextOne[2][0]
                 ##  curCluster is, perhaps obviously, the new cluster
                 ##  which combines cluster item1 with cluster item2.
                 curCluster = (item1, item2)
                 ## Now I am doing two things. First, finding the nearest
                 ## neighbor to this new cluster. Second, building a new
                 ## neighbors list by merging the neighbors lists of item1
                 ## and item2. If the distance between item1 and element 23
                 ## is 2 and the distance betweeen item2 and element 23 is 4
                 ## the distance between element 23 and the new cluster will
                 ## be 2 (i.e., the shortest distance).
                 ##
                 minDistance = 99999
                 nearestPair = ()
                 #nearestNeighbor = ''
                 merged = {}
                 nNeighbors = nextOne[2][2] #所有的邻居对和距离
                 for (key, value) in topOne[2][2].items():
                    if key in nNeighbors:           #只剩下最后一个的时候，不成立
                        if nNeighbors[key][1] < value[1]:
                             dist =  nNeighbors[key]
                        else:
                            dist = value
                        if dist[1] < minDistance:
                             minDistance =  dist[1]
                             nearestPair = dist[0]      #这里的对比较特别，只去了其中一个点
                             #nearestNeighbor = key
                        merged[key] = dist
                 if merged == {}:
                    return curCluster#返回元祖对，用来建立树
                 else:
                    self.queue.put( (minDistance, self.counter,
                                     [curCluster, nearestPair, merged]))
                    self.counter += 1

KMeans

这个算法和爬山法类似，不能保证最后能够找到最有的划分簇。因为算法一开始选择的是随机中心点的集合，所以只能确保找到局部最有划分簇。

误差平方和(sum of squared error,简称SSE)：度量某个簇集合的质量。

$SSE = \sum_{i=1}^k\sum_{x\in C_i}dist(c_i, x)^2$

import math
import random 
"""
Implementation of the K-means algorithm
for the book A Programmer's Guide to Data Mining"
http://www.guidetodatamining.com
"""
def getMedian(alist):
    """get median of list"""
    tmp = list(alist)
    tmp.sort()
    alen = len(tmp)
    if (alen % 2) == 1:
        return tmp[alen // 2]
    else:
        return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
def normalizeColumn(column):
    """normalize the values of a column using Modified Standard Score
    that is (each value - median) / (absolute standard deviation)"""
    median = getMedian(column)
    asd = sum([abs(x - median) for x in column]) / len(column)
    result = [(x - median) / asd for x in column]
    return result
class kClusterer:
    """ Implementation of kMeans Clustering
    This clusterer assumes that the first column of the data is a label
    not used in the clustering. The other columns contain numeric data
    """
    def __init__(self, filename, k):
        """ k is the number of clusters to make
        This init method:
           1. reads the data from the file named filename
           2. stores that data by column in self.data
           3. normalizes the data using Modified Standard Score
           4. randomly selects the initial centroids
           5. assigns points to clusters associated with those centroids
        """
        file = open(filename)
        self.data = {}
        self.k = k
        self.counter = 0
        self.iterationNumber = 0
        # used to keep track of % of points that change cluster membership
        # in an iteration
        self.pointsChanged = 0
        # Sum of Squared Error
        self.sse = 0
        #
        # read data from file
        #
        lines = file.readlines()
        file.close()
        header = lines[0].split(',')
        self.cols = len(header)
        self.data = [[] for i in range(len(header))]
        # we are storing the data by column.
        # For example, self.data[0] is the data from column 0.
        # self.data[0][10] is the column 0 value of item 10.
        for line in lines[1:]:
            cells = line.split(',')
            toggle = 0
            for cell in range(self.cols):
                if toggle == 0:
                   self.data[cell].append(cells[cell])
                   toggle = 1
                else:
                    self.data[cell].append(float(cells[cell]))
        self.datasize = len(self.data[1])
        self.memberOf = [-1 for x in range(len(self.data[1]))]
        #
        # now normalize number columns
        #
        for i in range(1, self.cols):
                self.data[i] = normalizeColumn(self.data[i])
        # select random centroids from existing points
        random.seed()
        #sample(seq, n) 从序列seq中选择n个随机且独立的元素
        self.centroids = [[self.data[i][r]  for i in range(1, len(self.data))]
                           for r in random.sample(range(len(self.data[0])),
                                                 self.k)]
        self.assignPointsToCluster()
    def updateCentroids(self):
        """Using the points in the clusters, determine the centroid
        (mean point) of each cluster"""
        members = [self.memberOf.count(i) for i in range(len(self.centroids))]#计数列表
        #对centroid类的数据的第k列数据 第i个中心点求和，计算平均        
        self.centroids = [[sum([self.data[k][i]
                                for i in range(len(self.data[0]))
                                if self.memberOf[i] == centroid])/members[centroid]
                           for k in range(1, len(self.data))]
                          for centroid in range(len(self.centroids))] 
    def assignPointToCluster(self, i):
        """ assign point to cluster based on distance from centroids"""
        min = 999999
        clusterNum = -1
        for centroid in range(self.k):
            dist = self.euclideanDistance(i, centroid)
            if dist < min:
                min = dist
                clusterNum = centroid
        # here is where I will keep track of changing points
        if clusterNum != self.memberOf[i]:      #第一次认为全部变动
            self.pointsChanged += 1
        # add square of distance to running sum of squared error
        self.sse += min**2
        return clusterNum
    def assignPointsToCluster(self):
        """ assign each data point to a cluster"""
        self.pointsChanged = 0
        self.sse = 0
        self.memberOf = [self.assignPointToCluster(i)
                         for i in range(len(self.data[1]))]
    def euclideanDistance(self, i, j):
        """ compute distance of point i from centroid j"""
        sumSquares = 0
        for k in range(1, self.cols):
            sumSquares += (self.data[k][i] - self.centroids[j][k-1])**2
        return math.sqrt(sumSquares)
    def kCluster(self):
        """the method that actually performs the clustering
        As you can see this method repeatedly
            updates the centroids by computing the mean point of each cluster
            re-assign the points to clusters based on these new centroids
        until the number of points that change cluster membership is less than 1%.
        """
        done = False
        while not done:
            self.iterationNumber += 1
            self.updateCentroids()
            self.assignPointsToCluster()
            #
            # we are done if fewer than 1% of the points change clusters
            #
            if float(self.pointsChanged) / len(self.memberOf) <  0.01:
                done = True
        print("Final SSE: %f" % self.sse)
    def showMembers(self):
        """Display the results"""
        for centroid in range(len(self.centroids)):
             print ("\n\nClass %i\n========" % centroid)
             for name in [self.data[0][i]  for i in range(len(self.data[0]))
                          if self.memberOf[i] == centroid]:
                 print (name)
##
## RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3
###
# change the path in the following to match where dogs.csv is on your machine
km = kClusterer('data/dogs.csv', 3)
km.kCluster()
km.showMembers()

K-means++通过修改初始中心点的选择方法改进了由于初始的随机性导致的结果非优的缺点。虽然第一个点选择是随机的，但其他的点则优先选择那些彼此距离很远的点。

选择初始中心点的步骤：

从数据点中随机选择第一个中心点
重复以下过程，直到选出k个中心点
- 计算每个数据点dp到其最近的中心点的距离D(dp)
- 以正比D(dp)的概率，随机选择一个数据点作为新中心点加入到中心点集合

    def selectInitialCentroids(self):
        """implement the k-means++ method of selecting
        the set of initial centroids"""
        centroids = []
        total = 0
        # first step is to select a random first centroid
        current = random.choice(range(len(self.data[0])))
        centroids.append(current)
        # loop to select the rest of the centroids, one at a time
        for i in range(0, self.k - 1):
            # for every point in the data find its distance to
            # the closest centroid
            weights = [self.distanceToClosestCentroid(x, centroids) 
                       for x in range(len(self.data[0]))]
            total = sum(weights)
            # instead of raw distances, convert so sum of weight = 1
            weights = [x / total for x in weights]
            #
            # now roll virtual die
            num = random.random()
            total = 0
            x = -1
            # the roulette wheel simulation
            while total < num:
                x += 1
                total += weights[x]
            centroids.append(x)
        self.centroids = [[self.data[i][r]  for i in range(1, len(self.data))]
                            for r in centroids]
    def distanceToClosestCentroid(self, point, centroidList):
        result = self.eDistance(point, centroidList[0])
        for centroid in centroidList[1:]:
            distance = self.eDistance(point, centroid)
            if distance < result:
                result = distance
        return result
    def eDistance(self, i, j):
        """ compute distance of point i from centroid j"""
        sumSquares = 0
        for k in range(1, self.cols):
            sumSquares += (self.data[k][i] - self.data[k][j])**2
        return math.sqrt(sumSquares)