Source code for tdub.frames

"""Module for handling dataframes."""

# stdlib
import logging
import re

from typing import Optional, Union, List, Iterable

# externals
import pandas as pd
import uproot

# tdub
import tdub.config
from tdub.data import (
    Region,
    avoids_for,
    categorize_branches,
    branches_from,
    selection_branches,
    selection_as_numexpr,
)


log = logging.getLogger(__name__)


def raw_dataframe(
    files: Union[str, List[str]],
    tree: str = "WtLoop_nominal",
    weight_name: str = "weight_nominal",
    branches: Optional[Iterable[str]] = None,
    drop_weight_sys: bool = False,
    **kwargs,
) -> pd.DataFrame:
    """Construct a raw pandas flavored DataFrame with help from uproot.

    We call this dataframe "raw" because it hasn't been parsed by any
    other tdub.frames functionality (no selection performed, kinematic
    and weight branches won't be separated, etc.) -- just a pure raw
    dataframe from some ROOT files. Extra `kwargs` are fed to uproot's
    ``arrays()`` interface.

    Parameters
    ----------
    files : list(str) or str
        Single ROOT file or list of ROOT files.
    tree : str
        The tree name to turn into a dataframe.
    weight_name : str
        Weight branch (we make sure to grab it if you give something
        other than ``None`` to ``branches``).
    branches : list(str), optional
        List of branches to include as columns in the dataframe,
        default is ``None`` (all branches are included).
    drop_weight_sys : bool
        Drop all weight systematics from the branches being grabbed.

    Returns
    -------
    pandas.DataFrame
        The pandas flavored DataFrame with all requested branches.

    Examples
    --------
    >>> from tdub.data import quick_files
    >>> from tdub.frames import raw_dataframe
    >>> files = quick_files("/path/to/files")["ttbar"]
    >>> df = raw_dataframe(files)

    """
    if branches is not None:
        branches = sorted(set(branches) | {weight_name}, key=str.lower)
    else:
        branches = branches_from(files, tree)
    if weight_name not in branches:
        raise RuntimeError(f"{weight_name} not present in {tree}")
    if drop_weight_sys:
        weight_sys_re = re.compile(r"^weight_sys\w+")
        branches = sorted(
            set(branches) ^ set(filter(weight_sys_re.match, branches)), key=str.lower
        )
    if isinstance(files, str):
        files = [files]
    result = pd.concat(
        [
            uproot.open(f).get(tree).arrays(branches, library="pd", **kwargs)
            for f in files
        ]
    )
    result.selection_used = None
    return result
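
# A minimal usage sketch for raw_dataframe (hypothetical file paths): drop the
# weight_sys_* branches and forward an extra keyword argument (here uproot's
# ``entry_stop``) through to ``arrays()``.
#
# >>> from tdub.frames import raw_dataframe
# >>> df = raw_dataframe(
# ...     ["/path/to/a.root", "/path/to/b.root"],
# ...     drop_weight_sys=True,
# ...     entry_stop=1000,
# ... )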
def iterative_selection(
    files: Union[str, List[str]],
    selection: str,
    tree: str = "WtLoop_nominal",
    weight_name: str = "weight_nominal",
    branches: Optional[List[str]] = None,
    keep_category: Optional[str] = None,
    exclude_avoids: bool = False,
    use_campaign_weight: bool = False,
    use_tptrw: bool = False,
    use_trrw: bool = False,
    sample_frac: Optional[float] = None,
    **kwargs,
) -> pd.DataFrame:
    """Build a selected dataframe by iterating over ROOT files with uproot.

    If we want to build a memory-hungry dataframe and apply a
    selection, this helps us avoid crashing due to using all of our
    RAM. Constructing a dataframe with this function is useful when we
    want to grab many branches in a large dataset that won't fit in
    memory before the selection.

    The selection can be in either numexpr or ROOT form; we ensure
    that a ROOT style selection is converted to numexpr for use with
    :py:func:`pandas.eval`.

    Parameters
    ----------
    files : list(str) or str
        A single ROOT file or list of ROOT files.
    selection : str
        Selection string (numexpr or ROOT form accepted).
    tree : str
        Tree name to turn into a dataframe.
    weight_name : str
        Weight branch to preserve.
    branches : list(str), optional
        List of branches to include as columns in the dataframe,
        default is ``None`` (all branches).
    keep_category : str, optional
        If not ``None``, the selected dataframe will only include
        columns which are part of the given category (see
        :py:func:`tdub.data.categorize_branches`). The weight branch
        is always kept.
    exclude_avoids : bool
        Exclude branches defined by :py:data:`tdub.config.AVOID_IN_CLF`.
    use_campaign_weight : bool
        Multiply the nominal weight by the campaign weight. This is
        potentially necessary if the samples were prepared without the
        campaign weight included in the product which forms the
        nominal weight.
    use_tptrw : bool
        Apply the top pt reweighting factor.
    use_trrw : bool
        Apply the top recursive reweighting factor.
    sample_frac : float, optional
        Sample a fraction of the available data.

    Returns
    -------
    pandas.DataFrame
        The final selected dataframe from the files.

    Examples
    --------
    Creating a ``ttbar_df`` dataframe and a single ``tW_df`` dataframe:

    >>> from tdub.frames import iterative_selection
    >>> from tdub.data import quick_files
    >>> from tdub.data import selection_for
    >>> qf = quick_files("/path/to/files")
    >>> ttbar_df = iterative_selection(qf["ttbar"], selection_for("2j2b"))
    >>> tW_df = iterative_selection(qf["tW_DR"], selection_for("2j2b"))

    Keep only kinematic branches after selection and ignore avoided
    columns:

    >>> tW_df = iterative_selection(
    ...     qf["tW_DR"],
    ...     selection_for("2j2b"),
    ...     exclude_avoids=True,
    ...     keep_category="kinematics",
    ... )

    """
    # determine which branches will be used for selection only and
    # which branches we need for weights
    sel_branches = selection_branches(selection)
    weights_to_grab = {weight_name}
    if use_campaign_weight:
        weights_to_grab.add("weight_campaign")
        log.info("applying the campaign weight")
    if use_tptrw:
        weights_to_grab.add("weight_tptrw_tool")
        log.info("applying the top pt reweighting factor")
    if use_trrw:
        weights_to_grab.add("weight_trrw_tool")
        log.info("applying the top recursive reweighting factor")
    if sample_frac is not None:
        log.info(f"Sampling {100 * sample_frac}% of events")
    if branches is None:
        branches = set(branches_from(files, tree=tree))
    branches = set(branches)
    sel_only_branches = sel_branches - branches

    # determine which branches to keep after reading the dataframes
    # and which are necessary during reading.
    if keep_category is not None:
        branches_cated = categorize_branches(list(branches))
        keep_cat = set(branches_cated.get(keep_category))
        keep = keep_cat & branches
        read_branches = list(keep | weights_to_grab | sel_branches)
    else:
        keep = branches
        read_branches = list(branches | weights_to_grab | sel_branches)

    # drop avoided classifier variables
    if exclude_avoids:
        keep = keep - set(tdub.config.AVOID_IN_CLF)

    # always drop selection only branches
    keep = keep - sel_only_branches

    # always keep the requested weight (enforce here just in case).
    # sort into a list and move on to dataframes
    keep.add(weight_name)
    keep = sorted(keep, key=str.lower)

    if isinstance(files, str):
        files = [files]

    numexpr_sel = selection_as_numexpr(selection)
    dfs = []
    for i, f in enumerate(files):
        df = uproot.open(f).get(tree).arrays(read_branches, library="pd", **kwargs)
        if sample_frac is not None:
            df = df.sample(frac=sample_frac, random_state=tdub.config.RANDOM_STATE)
        if use_campaign_weight:
            apply_weight_campaign(df)
        if use_tptrw:
            apply_weight_tptrw(df)
        if use_trrw:
            apply_weight_trrw(df)
        idf = df.query(numexpr_sel)
        idf = idf[keep]
        dfs.append(idf)
        log.debug(f"finished iteration {i}")

    result = pd.concat(dfs)
    result.selection_used = numexpr_sel
    return result
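
# A sketch combining a few documented options of iterative_selection
# (hypothetical paths): keep only kinematic columns surviving the 2j2b
# selection, fold in the campaign weight, and sample half of the events.
#
# >>> from tdub.data import quick_files, selection_for
# >>> from tdub.frames import iterative_selection
# >>> qf = quick_files("/path/to/files")
# >>> df = iterative_selection(
# ...     qf["tW_DR"],
# ...     selection_for("2j2b"),
# ...     keep_category="kinematics",
# ...     use_campaign_weight=True,
# ...     sample_frac=0.5,
# ... )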
def satisfying_selection(*dfs: pd.DataFrame, selection: str) -> List[pd.DataFrame]:
    """Get subsets of dataframes that satisfy a selection.

    The selection string can be in either ROOT or numexpr form (a
    ROOT style selection is converted to numexpr).

    Parameters
    ----------
    *dfs : sequence of :py:obj:`pandas.DataFrame`
        Dataframes to apply the selection to.
    selection : str
        Selection string (in numexpr or ROOT form).

    Returns
    -------
    list(pandas.DataFrame)
        Dataframes satisfying the selection string.

    Examples
    --------
    >>> from tdub.data import quick_files
    >>> from tdub.frames import raw_dataframe, satisfying_selection
    >>> qf = quick_files("/path/to/files")
    >>> df_tW_DR = raw_dataframe(qf["tW_DR"])
    >>> df_ttbar = raw_dataframe(qf["ttbar"])
    >>> low_bdt = "(bdt_response < 0.4)"
    >>> tW_DR_selected, ttbar_selected = satisfying_selection(
    ...     df_tW_DR, df_ttbar, selection=low_bdt
    ... )

    """
    numexprsel = selection_as_numexpr(selection)
    newdfs = []
    for df in dfs:
        newdf = df.query(numexprsel)
        newdf.selection_used = numexprsel
        newdfs.append(newdf)
    return newdfs
def drop_cols(df: pd.DataFrame, *cols: str) -> None:
    """Drop some columns from a dataframe.

    This is a convenience function: any entry in ``cols`` that is not
    a column of the dataframe is silently ignored. We augment
    :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : :py:obj:`pandas.DataFrame`
        Dataframe which we want to slim.
    *cols : sequence of strings
        Columns to remove.

    Examples
    --------
    >>> import pandas as pd
    >>> from tdub.frames import drop_cols
    >>> df = pd.read_parquet("some_file.parquet")
    >>> "E_jet1" in df.columns
    True
    >>> "mass_jet1" in df.columns
    True
    >>> "mass_jet2" in df.columns
    True
    >>> drop_cols(df, "E_jet1", "mass_jet1")
    >>> "E_jet1" in df.columns
    False
    >>> "mass_jet1" in df.columns
    False
    >>> df.drop_cols("mass_jet2")  # use augmented df class
    >>> "mass_jet2" in df.columns
    False

    """
    in_dataframe = set(df.columns)
    in_cols = set(cols)
    in_both = list(in_dataframe & in_cols)
    log.debug("Dropping columns:")
    for c in in_both:
        log.debug(f" - {c}")
    df.drop(columns=in_both, inplace=True)
def drop_avoid(df: pd.DataFrame, region: Optional[Union[str, Region]] = None) -> None:
    """Drop columns that we avoid in classifiers.

    Uses :py:func:`tdub.frames.drop_cols` with a predefined set of
    columns (:py:data:`tdub.config.AVOID_IN_CLF`). We augment
    :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe that you want to slim.
    region : str or tdub.data.Region, optional
        Region to augment the list of dropped columns (see the region
        specific AVOID constants in the config module).

    Examples
    --------
    >>> from tdub.frames import drop_avoid
    >>> import pandas as pd
    >>> df = pd.read_parquet("some_file.parquet")
    >>> "E_jetL1" in df.columns
    True
    >>> drop_avoid(df)
    >>> "E_jetL1" in df.columns
    False

    """
    # copy so the region specific additions don't mutate
    # tdub.config.AVOID_IN_CLF in place
    to_drop = list(tdub.config.AVOID_IN_CLF)
    if region is not None:
        to_drop += avoids_for(region)
    drop_cols(df, *to_drop)
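
# A sketch of the optional ``region`` argument of drop_avoid, which extends the
# dropped set with the region specific avoid list (hypothetical file path):
#
# >>> import pandas as pd
# >>> from tdub.frames import drop_avoid
# >>> df = pd.read_parquet("some_file.parquet")
# >>> drop_avoid(df, region="2j2b")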
def drop_jet2(df: pd.DataFrame) -> None:
    """Drop all columns with jet2 properties.

    In the 1j1b region we obviously don't have a second jet, so this
    lets us get rid of all columns dependent on jet2 kinematic
    properties. We augment :py:class:`pandas.DataFrame` with this
    function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe that we want to slim.

    Examples
    --------
    >>> from tdub.frames import drop_jet2
    >>> import pandas as pd
    >>> df = pd.read_parquet("some_file.parquet")
    >>> "pTsys_lep1lep2jet1jet2met" in df.columns
    True
    >>> drop_jet2(df)
    >>> "pTsys_lep1lep2jet1jet2met" in df.columns
    False

    """
    j2cols = [col for col in df.columns if "jet2" in col]
    drop_cols(df, *j2cols)
def apply_weight(
    df: pd.DataFrame, weight_name: str, exclude: Optional[List[str]] = None
) -> None:
    """Apply (multiply) a weight to all other weights in the DataFrame.

    This will multiply the nominal weight and all systematic weights
    in the DataFrame by the ``weight_name`` column. We augment
    :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to operate on.
    weight_name : str
        Column name to multiply all other weight columns by.
    exclude : list(str), optional
        List of columns to exclude when determining the other weight
        columns to operate on.

    Examples
    --------
    >>> import tdub.frames
    >>> df = tdub.frames.raw_dataframe("/path/to/file.root")
    >>> df.apply_weight("weight_campaign")

    """
    sys_weight_cols = [c for c in df.columns if "weight_sys" in c]
    cols = ["weight_nominal"] + sys_weight_cols
    if exclude is not None:
        for entry in exclude:
            if entry in cols:
                cols.remove(entry)
    if weight_name in cols:
        log.warning(f"{weight_name} is in the columns list, dropping")
        cols.remove(weight_name)
    log.info(f"Applying {weight_name} to all weights in dataframe.")
    df.loc[:, cols] = df.loc[:, cols].multiply(df.loc[:, weight_name], axis="index")
def apply_weight_inverse(
    df: pd.DataFrame, weight_name: str, exclude: Optional[List[str]] = None
) -> None:
    """Apply an inverse weight (via division) to all other weights in the DataFrame.

    This will divide the nominal weight and all systematic weights in
    the DataFrame by the ``weight_name`` column. We augment
    :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to operate on.
    weight_name : str
        Column name to divide all other weight columns by.
    exclude : list(str), optional
        List of columns to exclude when determining the other weight
        columns to operate on.

    Examples
    --------
    >>> import tdub.frames
    >>> df = tdub.frames.raw_dataframe("/path/to/file.root")
    >>> df.apply_weight_inverse("weight_tptrw_tool")

    """
    sys_weight_cols = [c for c in df.columns if "weight_sys" in c]
    cols = ["weight_nominal"] + sys_weight_cols
    if exclude is not None:
        for entry in exclude:
            if entry in cols:
                cols.remove(entry)
    if weight_name in cols:
        log.warning(f"{weight_name} is in the columns list, dropping")
        cols.remove(weight_name)
    df.loc[:, cols] = df.loc[:, cols].divide(df.loc[:, weight_name], axis="index")
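
# A sketch showing that apply_weight_inverse undoes apply_weight when called
# with the same arguments (hypothetical file path):
#
# >>> from tdub.frames import raw_dataframe, apply_weight, apply_weight_inverse
# >>> df = raw_dataframe("/path/to/file.root")
# >>> apply_weight(df, "weight_tptrw_tool")
# >>> apply_weight_inverse(df, "weight_tptrw_tool")  # weights back to original values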
def apply_weight_campaign(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> None:
    """Multiply nominal and systematic weights by the campaign weight.

    This is useful for samples that were produced without the
    campaign weight term already applied to all other weights. We
    augment :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to operate on.
    exclude : list(str), optional
        List of columns to exclude when determining the other weight
        columns to operate on.

    Examples
    --------
    >>> import tdub.frames
    >>> df = tdub.frames.raw_dataframe("/path/to/file.root")
    >>> df.weight_nominal[5]
    0.003
    >>> df.weight_campaign[5]
    0.4
    >>> df.apply_weight_campaign()
    >>> df.weight_nominal[5]
    0.0012

    """
    apply_weight(df, "weight_campaign", exclude=exclude)
def apply_weight_tptrw(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> None:
    """Multiply nominal and systematic weights by the top pt reweight term.

    This is useful for samples that were produced without the top pt
    reweighting term already applied to all other weights. We augment
    :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to operate on.
    exclude : list(str), optional
        List of columns to exclude when determining the other weight
        columns to operate on.

    Examples
    --------
    >>> import tdub.frames
    >>> df = tdub.frames.raw_dataframe("/path/to/file.root")
    >>> df.weight_nominal[5]
    0.002
    >>> df.weight_tptrw_tool[5]
    0.98
    >>> df.apply_weight_tptrw()
    >>> df.weight_nominal[5]
    0.00196

    """
    excludes = ["weight_sys_noreweight"]
    if exclude is not None:
        excludes += exclude
    apply_weight(df, "weight_tptrw_tool", exclude=excludes)
def apply_weight_trrw(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> None:
    """Multiply nominal and systematic weights by the top recursive reweight term.

    This is useful for samples that were produced without the top
    recursive reweighting term already applied to all other weights.
    We augment :py:class:`pandas.DataFrame` with this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to operate on.
    exclude : list(str), optional
        List of columns to exclude when determining the other weight
        columns to operate on.

    Examples
    --------
    >>> import tdub.frames
    >>> df = tdub.frames.raw_dataframe("/path/to/file.root")
    >>> df.weight_nominal[5]
    0.002
    >>> df.weight_trrw_tool[5]
    0.98
    >>> df.apply_weight_trrw()
    >>> df.weight_nominal[5]
    0.00196

    """
    excludes = ["weight_sys_noreweight"]
    if exclude is not None:
        excludes += exclude
    apply_weight(df, "weight_trrw_tool", exclude=excludes)
pd.DataFrame.drop_cols = drop_cols
pd.DataFrame.drop_avoid = drop_avoid
pd.DataFrame.drop_jet2 = drop_jet2
pd.DataFrame.apply_weight = apply_weight
pd.DataFrame.apply_weight_campaign = apply_weight_campaign
pd.DataFrame.apply_weight_tptrw = apply_weight_tptrw
pd.DataFrame.apply_weight_trrw = apply_weight_trrw
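
# The assignments above attach the helpers to pandas.DataFrame, so once
# tdub.frames has been imported they are available as methods on any dataframe.
# A minimal sketch (hypothetical file path):
#
# >>> import tdub.frames
# >>> df = tdub.frames.raw_dataframe("/path/to/file.root")
# >>> df.drop_avoid()
# >>> df.apply_weight_campaign()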