def map_reduce(data, mapper, reducer=None):
    """Simple map/reduce for data analysis.

    Each element of *data* is passed to the *mapper* function.
    The mapper returns a ``(key, value)`` pair, or ``None`` for
    elements that should be skipped.

    Returns a dict mapping each key to the list of values emitted for
    that key, in the order the elements were encountered.  If a
    *reducer* is specified, it is called once per group with that list
    and its return value replaces the list in the result.

    A non-``None`` mapper result that cannot be unpacked into exactly
    two items raises the usual unpacking error (``TypeError`` or
    ``ValueError``); keys must be hashable because they are used as
    dict keys.

    >>> def even_odd(elem):                     # sample mapper
    ...     if 10 <= elem <= 20:                # skip elems outside the range
    ...         key = elem % 2                  # group into evens and odds
    ...         return key, elem

    >>> map_reduce(range(30), even_odd)         # show even/odd grouping
    {0: [10, 12, 14, 16, 18, 20], 1: [11, 13, 15, 17, 19]}

    >>> map_reduce(range(30), even_odd, sum)    # sum each group
    {0: 90, 1: 75}
    """
    groups = {}
    for elem in data:
        result = mapper(elem)
        # Compare against None explicitly: only None means "skip".
        if result is not None:
            key, value = result
            groups.setdefault(key, []).append(value)

    if reducer is None:
        return groups

    # Build a fresh dict rather than rewriting values while iterating.
    return {key: reducer(group) for key, group in groups.items()}
if __name__ == '__main__':
    # Demo: group a small dataset of people by gender and age decade,
    # then show several reducers applied to the same grouping.
    from collections import namedtuple
    from pprint import pprint

    Person = namedtuple('Person', ['name', 'gender', 'age', 'height'])

    persons = [
        Person('mary', 'fem', 21, 60.2),
        Person('suzy', 'fem', 32, 70.1),
        Person('jane', 'fem', 27, 58.1),
        Person('jill', 'fem', 24, 69.1),
        Person('bess', 'fem', 43, 66.6),
        Person('john', 'mal', 25, 70.8),
        Person('jack', 'mal', 40, 59.1),
        Person('mike', 'mal', 55, 60.3),
        Person('zack', 'mal', 45, 63.7),
        Person('alma', 'fem', 22, 67.0),
        Person('bill', 'mal', 20, 62.1),
    ]

    def height_by_gender_and_agegroup(p):
        """Mapper: key is (gender, age decade), value is the height."""
        key = p.gender, p.age // 10
        val = p.height
        return key, val

    def avg(s):
        """Reducer: arithmetic mean of the values in *s*."""
        return sum(s) / len(s)

    pprint(persons)                                                     # input dataset
    pprint(map_reduce(persons, lambda p: ((p.gender, p.age // 10), p)))  # grouped people
    pprint(map_reduce(persons, height_by_gender_and_agegroup, None))    # grouped heights
    pprint(map_reduce(persons, height_by_gender_and_agegroup, len))     # size of each group
    pprint(map_reduce(persons, height_by_gender_and_agegroup, max))     # maximum height by group
    pprint(map_reduce(persons, height_by_gender_and_agegroup, avg))     # average height by group
Diff to Previous Revision
--- revision 5 2011-04-25 22:19:08
+++ revision 6 2011-04-25 22:48:42
@@ -13,7 +13,7 @@
... key = elem % 2 # group into evens and odds
... return key, elem
- >>> map_reduce(range(30), even_odd) # group into evens and odds
+ >>> map_reduce(range(30), even_odd) # show even/odd grouping
{0: [10, 12, 14, 16, 18, 20], 1: [11, 13, 15, 17, 19]}
>>> map_reduce(range(30), even_odd, sum) # sum each group
@@ -21,37 +21,37 @@
'''
d = {}
- for entry in data:
- r = mapper(entry)
+ for elem in data:
+ r = mapper(elem)
if r is not None:
- k, v = r
- d.setdefault(k, []).append(v)
+ key, value = r
+ d.setdefault(key, []).append(value)
if reducer is not None:
- for k, group in d.items():
- d[k] = reducer(group)
+ for key, group in d.items():
+ d[key] = reducer(group)
return d
+
if __name__ == '__main__':
from collections import namedtuple
from pprint import pprint
- import doctest
Person = namedtuple('Person', ['name', 'gender', 'age', 'height'])
persons = [
- Person('mary', 'fem', 20, 60.2),
- Person('suzy', 'fem', 30, 50.1),
- Person('jane', 'fem', 20, 58.1),
- Person('jill', 'fem', 20, 49.1),
- Person('bess', 'fem', 40, 56.6),
- Person('john', 'mal', 20, 50.8),
+ Person('mary', 'fem', 21, 60.2),
+ Person('suzy', 'fem', 32, 70.1),
+ Person('jane', 'fem', 27, 58.1),
+ Person('jill', 'fem', 24, 69.1),
+ Person('bess', 'fem', 43, 66.6),
+ Person('john', 'mal', 25, 70.8),
Person('jack', 'mal', 40, 59.1),
- Person('jase', 'mal', 50, 60.3),
- Person('zack', 'mal', 40, 53.7),
- Person('ambr', 'fem', 20, 57.0),
- Person('bill', 'mal', 20, 62.1)
+ Person('mike', 'mal', 55, 60.3),
+ Person('zack', 'mal', 45, 63.7),
+ Person('alma', 'fem', 22, 67.0),
+ Person('bill', 'mal', 20, 62.1),
]
def height_by_gender_and_agegroup(p):
@@ -63,10 +63,8 @@
return sum(s) / len(s)
pprint(persons) # input dataset
- pprint(map_reduce(persons, lambda p: ((p.gender, p.age), p), None)) # grouped people
+ pprint(map_reduce(persons, lambda p: ((p.gender, p.age//10), p))) # grouped people
pprint(map_reduce(persons, height_by_gender_and_agegroup, None)) # grouped heights
pprint(map_reduce(persons, height_by_gender_and_agegroup, len)) # size of each group
pprint(map_reduce(persons, height_by_gender_and_agegroup, max)) # maximum height by group
pprint(map_reduce(persons, height_by_gender_and_agegroup, avg)) # average height by group
-
- print(doctest.testmod())