Table Manipulations
Join tables and compute grouped summaries.
"""Show joins, grouped summaries, and column management for tables."""
import emzed
# One row per (sample_id, compound) pair holding the measured peak area.
intensity_columns = ["sample_id", "compound", "area"]
intensity_types = [str, str, float]
intensity_rows = [
    ["S1", "glucose", 8.10e5],
    ["S1", "lactate", 2.50e5],
    ["S2", "glucose", 6.40e5],
    ["S2", "lactate", 3.10e5],
]
intensities = emzed.Table.create_table(
    intensity_columns,
    intensity_types,
    rows=intensity_rows,
)
# Sample-level lookup table: maps each sample_id to its condition label.
meta_rows = [
    ["S1", "control"],
    ["S2", "treated"],
]
meta = emzed.Table.create_table(
    ["sample_id", "condition"],
    [str, str],
    rows=meta_rows,
)
# Pair every intensity row with the metadata row of the same sample.
same_sample = intensities.sample_id == meta.sample_id
joined = intensities.join(meta, same_sample)
# In the join result the left-hand columns keep their names, while the
# columns taken from `meta` carry the postfix "__0" (sample_id__0,
# condition__0). Swap that postfix for a descriptive one:
#   sample_id__0 -> sample_id_meta
#   condition__0 -> condition_meta
joined.rename_postfixes(__0="_meta")
# Sum of `area` over all rows sharing a sample_id; each row receives the
# total of the group it belongs to.
area_sum_by_sample = joined.group_by(joined.sample_id).sum(joined.area)
joined.add_column("total_area_per_sample", area_sum_by_sample, float)
# Share of the sample total contributed by each individual row. This must
# come after the total column exists, because the expression reads it.
area_fraction = joined.area / joined.total_area_per_sample
joined.add_column("relative_area", area_fraction, float)
# Columns of the report, listed in display order.
summary_columns = (
    "sample_id",
    "condition_meta",
    "compound",
    "area",
    "total_area_per_sample",
    "relative_area",
)
# Keep only the report columns, then order rows by sample and compound.
summary = joined.extract_columns(*summary_columns).sort_by("sample_id", "compound")
print(summary)
sample_id condition_meta compound area total_area_per_sample relative_area
str str str float float float
--------- -------------- -------- ------------- --------------------- -------------
S1 control glucose 810000.000000 1060000.000000 0.764151
S1 control lactate 250000.000000 1060000.000000 0.235849
S2 treated glucose 640000.000000 950000.000000 0.673684
S2 treated lactate 310000.000000 950000.000000 0.326316