Simultaneously sum two keys in list of dicts by multiple items

Multi tool use


Simultaneously sum two keys in list of dicts by multiple items
I want to sum two different variables in one function, but I want these to be summed based on multiple other items.
If I have the following list of dicts x:
x=[{'id':1, 'var1':'a', 'var2':'left', 'var3':0.1, 'var4':1},
{'id':2, 'var1':'a', 'var2':'right', 'var3':0.1, 'var4':1},
{'id':2, 'var1':'a', 'var2':'right', 'var3':0.2, 'var4':3},
{'id':4, 'var1':'b', 'var2':'left', 'var3':0.4, 'var4':4},
{'id':5, 'var1':'b', 'var2':'right', 'var3':0.1, 'var4':5},
{'id':5, 'var1':'b', 'var2':'right', 'var3':0.4, 'var4':2}]
Then i can use the following function to sum an individual variable ('var3'), based on two other variables ('var1' and 'var2'):
from operator import itemgetter
from itertools import groupby
def aggregate_var3_by_var1_and_var2(data):
my_data=
grouper = itemgetter("id", "var1", "var2")
for key, grp in groupby(sorted(data, key = grouper), grouper):
temp_dict = dict(zip(["id", "var1", "var2"], key))
temp_dict["var3"] = sum(item["var3"] for item in grp)
my_data.append(temp_dict)
return my_data
my_output = aggregate_var3_by_var1_and_var2(x)
However, I want to sum multiple variables ('var3' and 'var4'), across multiple categories ('var1' and 'var2'), so that the output looks like this:
y=[{'id': 1, 'var1': 'a', 'var2': 'left', 'var3': 0.1, 'var4': 1},
{'id': 2, 'var1': 'a', 'var2': 'right', 'var3': 0.3, 'var4':4},
{'id': 4, 'var1': 'b', 'var2': 'left', 'var3': 0.4, 'var4':4},
{'id': 5, 'var1': 'b', 'var2': 'right', 'var3': 0.5, 'var4':7}]
How do I sum multiple variables at once using this method?
3 Answers
3
You can use Pandas for an efficient, vectorised solution.
The disadvantages of itertools.groupby
here are it requires sorting [additional complexity] and does not implement vectorised calculations [inefficient summing].
itertools.groupby
If you want to go down the loopy route, I recommend collections.defaultdict
to make sure you still have O(n) complexity.
collections.defaultdict
import pandas as pd
df = pd.DataFrame(x)
res = df.groupby(['id', 'var1', 'var2']).agg({'var3': 'sum', 'var4': 'sum'}).reset_index()
print(res.to_dict('records'))
[{'id': 1, 'var1': 'a', 'var2': 'left', 'var3': 0.1, 'var4': 1},
{'id': 2, 'var1': 'a', 'var2': 'right', 'var3': 0.3, 'var4': 4},
{'id': 4, 'var1': 'b', 'var2': 'left', 'var3': 0.4, 'var4': 4},
{'id': 5, 'var1': 'b', 'var2': 'right', 'var3': 0.5, 'var4': 7}]
from itertools import groupby
x=[{'id':1, 'var1':'a', 'var2':'left', 'var3':0.1, 'var4':1},
{'id':2, 'var1':'a', 'var2':'right', 'var3':0.1, 'var4':1},
{'id':2, 'var1':'a', 'var2':'right', 'var3':0.2, 'var4':3},
{'id':4, 'var1':'b', 'var2':'left', 'var3':0.4, 'var4':4},
{'id':5, 'var1':'b', 'var2':'right', 'var3':0.1, 'var4':5},
{'id':5, 'var1':'b', 'var2':'right', 'var3':0.4, 'var4':2}]
res =
for key, value in groupby(x, lambda x: x["id"]):
d = None
for i in value:
if not d:
d = i
else:
d["var3"] += i["var3"]
d["var4"] += i["var4"]
res.append(d)
print(res)
Output:
[{'id': 1, 'var1': 'a', 'var2': 'left', 'var3': 0.1, 'var4': 1},
{'id': 2,
'var1': 'a',
'var2': 'right',
'var3': 0.30000000000000004,
'var4': 4},
{'id': 4, 'var1': 'b', 'var2': 'left', 'var3': 0.4, 'var4': 4},
{'id': 5, 'var1': 'b', 'var2': 'right', 'var3': 0.5, 'var4': 7}]
A straightforward extension of your approach, provide the grouper keys and value keys as arguments:
from operator import itemgetter
from itertools import groupby
from itertools import chain
def reducer(ts):
return map(sum, zip(*ts))
def agg(data, keys, aggfields):
my_data =
getter = itemgetter(*aggfields)
grouper = itemgetter(*keys)
for ks, grp in groupby(sorted(data, key=grouper), grouper):
vs = map(getter, grp)
kvs = chain(zip(keys,ks), zip(aggfields, reducer(vs)))
my_data.append(dict(kvs))
return my_data
In the repl:
In [9]: x=[{'id':1, 'var1':'a', 'var2':'left', 'var3':0.1, 'var4':1},
...: {'id':2, 'var1':'a', 'var2':'right', 'var3':0.1, 'var4':1},
...: {'id':2, 'var1':'a', 'var2':'right', 'var3':0.2, 'var4':3},
...: {'id':4, 'var1':'b', 'var2':'left', 'var3':0.4, 'var4':4},
...: {'id':5, 'var1':'b', 'var2':'right', 'var3':0.1, 'var4':5},
...: {'id':5, 'var1':'b', 'var2':'right', 'var3':0.4, 'var4':2}]
In [10]: agg(x, ['var1','var2'], ['var3','var4'])
Out[10]:
[{'var1': 'a', 'var2': 'left', 'var3': 0.1, 'var4': 1},
{'var1': 'a', 'var2': 'right', 'var3': 0.30000000000000004, 'var4': 4},
{'var1': 'b', 'var2': 'left', 'var3': 0.4, 'var4': 4},
{'var1': 'b', 'var2': 'right', 'var3': 0.5, 'var4': 7}]
By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.