Welcome, guest | Sign In | My Account | Store | Cart
def map_reduce(data, mapper, reducer=None):
    '''Group mapped values by key, optionally reducing each group.

    The *mapper* is called once for every element of *data*.  It should
    return a ``(key, value)`` pair, or None to leave that element out.

    The result is a dict that maps each key to the list of values
    produced for it, in the order they were encountered.  When a
    *reducer* is supplied, each list is replaced by ``reducer(list)``.

    >>> def even_odd(elem):                     # sample mapper
    ...     if 10 <= elem <= 20:                # skip elems outside the range
    ...         key = elem % 2                  # group into evens and odds
    ...         return key, elem

    >>> map_reduce(range(30), even_odd)         # show even/odd grouping
    {0: [10, 12, 14, 16, 18, 20], 1: [11, 13, 15, 17, 19]}

    >>> map_reduce(range(30), even_odd, sum)    # sum each group
    {0: 90, 1: 75}

    '''
    groups = {}
    for item in data:
        pair = mapper(item)
        if pair is None:
            # The mapper asked for this element to be skipped.
            continue
        k, v = pair
        if k in groups:
            groups[k].append(v)
        else:
            groups[k] = [v]

    if reducer is None:
        return groups
    # Collapse every collected list into a single aggregated value.
    return {k: reducer(values) for k, values in groups.items()}



if __name__ == '__main__':

    from collections import namedtuple
    from pprint import pprint

    # Lightweight record type for the demonstration dataset.
    Person = namedtuple('Person', ['name', 'gender', 'age', 'height'])

    persons = [
        Person('mary', 'fem', 21, 60.2),
        Person('suzy', 'fem', 32, 70.1),
        Person('jane', 'fem', 27, 58.1),
        Person('jill', 'fem', 24, 69.1),
        Person('bess', 'fem', 43, 66.6),
        Person('john', 'mal', 25, 70.8),
        Person('jack', 'mal', 40, 59.1),
        Person('mike', 'mal', 55, 60.3),
        Person('zack', 'mal', 45, 63.7),
        Person('alma', 'fem', 22, 67.0),
        Person('bill', 'mal', 20, 62.1),
    ]

    def gender_and_agegroup(p):
        # Grouping key: gender paired with the decade of the person's age.
        return p.gender, p.age // 10

    def person_by_gender_and_agegroup(p):
        # Mapper that keeps the whole record as the grouped value.
        return gender_and_agegroup(p), p

    def height_by_gender_and_agegroup(p):
        # Mapper that keeps only the height as the grouped value.
        return gender_and_agegroup(p), p.height

    def avg(s):
        # Arithmetic mean of a non-empty sequence of numbers.
        total = sum(s)
        return total / len(s)

    # Input dataset, followed by the people grouped by gender and age decade.
    pprint(persons)
    pprint(map_reduce(persons, person_by_gender_and_agegroup))

    # Heights per group, reported in turn as: the raw lists (None),
    # the group size (len), the maximum (max) and the average (avg).
    for reducer in (None, len, max, avg):
        pprint(map_reduce(persons, height_by_gender_and_agegroup, reducer))

Diff to Previous Revision

--- revision 5 2011-04-25 22:19:08
+++ revision 6 2011-04-25 22:48:42
@@ -13,7 +13,7 @@
     ...         key = elem % 2                  # group into evens and odds
     ...         return key, elem
 
-    >>> map_reduce(range(30), even_odd)     # group into evens and odds
+    >>> map_reduce(range(30), even_odd)         # show even/odd grouping
     {0: [10, 12, 14, 16, 18, 20], 1: [11, 13, 15, 17, 19]}
 
     >>> map_reduce(range(30), even_odd, sum)    # sum each group
@@ -21,37 +21,37 @@
 
     '''
     d = {}
-    for entry in data:
-        r = mapper(entry)
+    for elem in data:
+        r = mapper(elem)
         if r is not None:
-            k, v = r
-            d.setdefault(k, []).append(v)
+            key, value = r
+            d.setdefault(key, []).append(value)
     if reducer is not None:
-        for k, group in d.items():
-            d[k] = reducer(group)
+        for key, group in d.items():
+            d[key] = reducer(group)
     return d
+
 
 
 if __name__ == '__main__':
 
     from collections import namedtuple
     from pprint import pprint
-    import doctest
 
     Person = namedtuple('Person', ['name', 'gender', 'age', 'height'])
 
     persons = [
-        Person('mary', 'fem', 20, 60.2),
-        Person('suzy', 'fem', 30, 50.1),
-        Person('jane', 'fem', 20, 58.1),
-        Person('jill', 'fem', 20, 49.1),
-        Person('bess', 'fem', 40, 56.6),
-        Person('john', 'mal', 20, 50.8),
+        Person('mary', 'fem', 21, 60.2),
+        Person('suzy', 'fem', 32, 70.1),
+        Person('jane', 'fem', 27, 58.1),
+        Person('jill', 'fem', 24, 69.1),
+        Person('bess', 'fem', 43, 66.6),
+        Person('john', 'mal', 25, 70.8),
         Person('jack', 'mal', 40, 59.1),
-        Person('jase', 'mal', 50, 60.3),
-        Person('zack', 'mal', 40, 53.7),
-        Person('ambr', 'fem', 20, 57.0),
-        Person('bill', 'mal', 20, 62.1)
+        Person('mike', 'mal', 55, 60.3),
+        Person('zack', 'mal', 45, 63.7),
+        Person('alma', 'fem', 22, 67.0),
+        Person('bill', 'mal', 20, 62.1),
     ]
 
     def height_by_gender_and_agegroup(p):
@@ -63,10 +63,8 @@
         return sum(s) / len(s)
 
     pprint(persons)                                                      # input dataset
-    pprint(map_reduce(persons, lambda p: ((p.gender, p.age), p), None))  # grouped people
+    pprint(map_reduce(persons, lambda p: ((p.gender, p.age//10), p)))    # grouped people
     pprint(map_reduce(persons, height_by_gender_and_agegroup, None))     # grouped heights
     pprint(map_reduce(persons, height_by_gender_and_agegroup, len))      # size of each group
     pprint(map_reduce(persons, height_by_gender_and_agegroup, max))      # maximum height by group
     pprint(map_reduce(persons, height_by_gender_and_agegroup, avg))      # average height by group
-
-    print(doctest.testmod())

History