Machine Learning--FP-growth algorithm

FP-growth algorithm


1. Principles

  • Compared with the Apriori algorithm, the FP-growth algorithm finds frequent itemsets faster.
  • The FP-growth algorithm stores data in a compact data structure of the FP tree.Unlike the search tree, an element can appear multiple times in the FP tree.The FP tree stores the frequency at which itemsets occur, and each itemset is stored in the tree as a path, connecting similar elements through a link.
  • Building a FP tree requires scanning the original dataset twice.The first iteration through the dataset obtains the frequency of each element item, eliminating those elements that do not meet the minimum support rate.The second scan only needs to consider frequent elements to build the FP tree.When building a tree, each itemset is sorted according to the absolute frequency of element items, then each itemset is read in and added to an existing path, or a new path is created if the path does not exist.

2. Code

(1) Create FP Tree

# Node to save tree
class TreeNode:
    def __init__(self, nameValue, occurNum, parentNode): = nameValue  # Node name
        self.count = occurNum  # Count Value
        self.nodeLink = None  # Link similar elements
        self.parent = parentNode  # Parent of the current node
        self.children = {}  # All child nodes of the current node

    def increase(self, occurNum):
        self.count += occurNum  # Increase Count Value

    # Display the tree as text
    def display(self, space=1):
        print('  ' * space,, " ", self.count)
        for child in self.children.values():
            child.display(space + 1)

def createTree(dataset, minSup=1):
    headerTable = {}
    # Traverse the entire dataset, counting each element item in each itemset
    for trans in dataset:
        for item in trans:
            """get()Method returns to dictionary item The corresponding value, if not found, returns 0
                item Occurs once, counts+1"""
            headerTable[item] = headerTable.get(item, 0) + dataset[trans]
    for k in list(headerTable.keys()):
        if headerTable[k] < minSup:
            del (headerTable[k])  # Eliminate elements that do not meet minimum support
    freqItemSet = set(headerTable.keys())  # Get a non-repeating set of frequent items
    if len(freqItemSet) == 0: return None, None  # Exit if none of the element items meet minimum support
    for k in headerTable:
        # Expand the headerTable value so that the frequency of each frequent item and the node where the frequent item first appears (use the node as a pointer) are saved simultaneously
        headerTable[k] = [headerTable[k], None]
    retTree = TreeNode('Null Set', 1, None)  # Create Root Node
    for tranSet, count in dataset.items():  # Second traversal (each transaction and the number of times it occurs)
        localD = {}
        for item in tranSet:
            if item in freqItemSet:  # Filter out items that do not meet the requirements
                localD[item] = headerTable[item][0]  # Get the count value for each element item
        if len(localD) > 0:
            # Generate a list of element items arranged from large to small in frequency
            """sorted()Functions follow optional parameters key Sort, reverse=True Indicates from large to small.
            lambda A function encapsulates a simple procedure followed by a colon and a return value. The following procedure represents a return y Value of the second dimension
            //So sort localD.items() from large to small by their second dimension value (count value), and return the key to localD, which is the element item''.
            orderItems = [x[0] for x in sorted(localD.items(), key=lambda y: y[1], reverse=True)]
            updateTree(orderItems, retTree, headerTable, count)  # Update FP Tree
    return retTree, headerTable

# Build FP tree from each filtered and sorted transaction
def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:  # Does the first element item exist in the child nodes of the tree
        inTree.children[items[0]].increase(count)  # If present, the count value of the node is increased by count
        inTree.children[items[0]] = TreeNode(items[0], count, inTree)  # If not, create a new node
        if headerTable[items[0]][1] is None:  # Does the element item have a pointer to it
            headerTable[items[0]][1] = inTree.children[items[0]]  # If not, add a new node as a pointer
        else:  # Update the link to the new node if there is already a pointer to the element item
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # Continue building the FP tree for the remaining element items in the items list
        # items begins slicing from the second element, and the last colon indicates that the step size defaults to 1.The following element is updated on the child nodes of the previous element
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

# Update Pointer Link
def updateHeader(node, targetNode):
    while node.nodeLink is not None:  # Pointer goes down from start to end of node
        node = node.nodeLink
    node.nodeLink = targetNode  # Link the end node to the specified node

if __name__ == "__main__":
    simpdat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    dict1 = {}
    for i in simpdat:
        dict1[frozenset(i)] = 1
    tree, header=createTree(dict1,minSup=3)

# Result
>>> Null Set   1
      z   5
        r   1
        x   3
          s   2
            y   2
              t   2
          r   1
            y   1
              t   1
      x   1
        r   1
          s   1


(2) Extracting frequent itemsets from a built FP tree

  • Steps:
    (1) Conditional mode base to get the specified item in the FP tree (conditional mode base refers to the set of paths ending with the element item being searched, that is, the content between the element item being searched and the root node)
    (2) Construct conditional FP tree based on conditional pattern
    (3) Repeat the above two steps until the tree contains an element item

  • Gets the conditional mode base__of the specified item

# Upstream the whole tree
def ascendTree(leafNode, prefixPath):
    if leafNode.parent is not None:  # If parent node exists
        prefixPath.append(  # Add current node to prefix path
        ascendTree(leafNode.parent, prefixPath)  # Call itself until the topmost node is found

# Find the path to the specified item
def findPrefixPath(treeNode):
    condPat = {}  # Storage Conditions Mode Base
    while treeNode is not None:
        prefixPath = []  # prefix path
        ascendTree(treeNode, prefixPath)  # Go back to the top node and save the passed node in prefixPath
        if len(prefixPath) > 1:  # If prefix path exists
            # Save path (excluding root node) and count value of current node in dictionary
            condPat[frozenset(prefixPath[1:])] = treeNode.count
        # Depending on the node's link, go to the next node with the same element item
        treeNode = treeNode.nodeLink
    return condPat
if __name__ == "__main__":
    tree, header = createTree(dict1, 3)
    x = findPrefixPath(header['x'][1])

>>> {frozenset({'z'}): 3}
  • Construct conditional FP tree: Construct conditional FP tree for each frequent element item and combination of frequent element items (e.g.'x'can construct FP tree for frequent item, if the combination of'x' and't'is also frequent, then you need to also construct FP tree for ['x','t']
# Create conditional FP tree
def mineTree(headerTable, minSupp, preFix, freqItemList):
    # Gets the element items sorted from smallest to largest in frequency
    bigL = [y[0] for y in sorted(headerTable.items(), key=lambda x: x[1][0])]
    for basePatt in bigL:  # Traverse all frequent items
        newFreqSet = preFix.copy()
        newFreqSet.add(basePatt)  # Store current frequent items
        freqItemList.append(newFreqSet)  # Store all frequent items (including combined frequent items)
        condPatBases = findPrefixPath(headerTable[basePatt][1])  # Find all prefix paths for current frequent items
        mycondTree, myHead = createTree(condPatBases, minSupp)  # Construct conditional FP tree based on prefix path
        if myHead is not None:  # If an element item exists in the FP tree, display and continue calling itself
            print('conditional tree for:', newFreqSet)
            mineTree(myHead, minSupp, newFreqSet, freqItemList)

if __name__ == "__main__":
    freq = []
    mineTree(header, 3, set([]), freq)
    print('freqItems:', freq)

>>> conditional tree for: {'s'}
   	Null Set   1
          x   3
    conditional tree for: {'y'}
   	Null Set   1
   	  x   3
   	    z   3
    conditional tree for: {'y', 'z'}
   	Null Set   1
   	  x   3 
    conditional tree for: {'t'}
   	Null Set   1
    	 y   3
   	    x   3
    	      z   3
    ........................Omit partial results

freqItems: [{'r'}, {'s'}, {'s', 'x'}, {'y'}, {'y', 'x'}, {'y', 'z'}, {'y', 'x', 'z'}, {'t'}, {'t', 'y'}, {'t', 'x'}, {'t', 'x', 'y'}, {'t', 'z'}, {'t', 'z', 'y'}, {'t', 'x', 'z'}, {'t', 'x', 'z', 'y'}, {'x'}, {'x', 'z'}, {'z'}]
Twenty-six original articles were published, 2 were praised, and 667 were visited
Private letter follow

Tags: Lambda

Posted on Wed, 29 Jan 2020 18:16:51 -0800 by tekkenlord