Source code for tdub.data

"""Module for working with datasets."""

from __future__ import annotations

# stdlib
from enum import Enum
from pathlib import PosixPath
import logging
import os
import re
from typing import Union, Set, Dict, Iterable, List, Optional

# external
import formulate
import uproot
from uproot.reading import ReadOnlyDirectory
from uproot.behaviors.TTree import TTree

# tdub
import tdub.config

log = logging.getLogger(__name__)

DataSource = Union[
    str, Iterable[str], os.PathLike, Iterable[os.PathLike], ReadOnlyDirectory, TTree
]


class Region(Enum):
    """Enumeration of the analysis regions.

    Attributes
    ----------
    r1j1b
        Label for our `1j1b` region.
    r2j1b
        Label for our `2j1b` region.
    r2j2b
        Label for our `2j2b` region.
    rUnkn
        Extra member with value 9 (presumably a placeholder for an
        unknown region).

    Examples
    --------
    Using this enum for grabbing the ``2j2b`` region from a set of
    files:

    >>> from tdub.data import Region, selection_for
    >>> from tdub.frames import iterative_selection
    >>> df = iterative_selection(files, selection_for(Region.r2j2b))

    """

    r1j1b = 0
    r2j1b = 1
    r2j2b = 2
    rUnkn = 9

    @staticmethod
    def from_str(s: str) -> Region:
        """Translate a string into the matching enum member.

        Three spellings are understood: prefixed with "reg", prefixed
        with "r", or no prefix at all. For example, ``Region.r2j2b``
        can be retrieved like so:

        - ``Region.from_str("r2j2b")``
        - ``Region.from_str("reg2j2b")``
        - ``Region.from_str("2j2b")``

        Parameters
        ----------
        s : str
            String representation of the desired region

        Returns
        -------
        Region
            Enum version

        Raises
        ------
        KeyError
            If `s` starts with "r" (but not "reg") and is not the
            name of a member.
        ValueError
            If an unprefixed `s` is not one of the known regions.

        Examples
        --------
        >>> from tdub.data import Region
        >>> Region.from_str("1j1b")
        <Region.r1j1b: 0>

        """
        # "reg..." spellings: keep what follows the last "reg" and
        # dispatch again on the remainder.
        if s.startswith("reg"):
            return Region.from_str(s.split("reg")[-1])
        # "r..." spellings are member names already.
        if s.startswith("r"):
            return Region[s]
        unprefixed = {
            "2j2b": Region.r2j2b,
            "2j1b": Region.r2j1b,
            "1j1b": Region.r1j1b,
        }
        if s in unprefixed:
            return unprefixed[s]
        raise ValueError(f"{s} doesn't correspond to a Region")

    def __str__(self) -> str:
        """Return the member name without its leading character ('r')."""
        member_name = self.name
        return member_name[1:]


def as_region(region: Union[str, Region]) -> Region:
    """Coerce the argument to a :py:obj:`~Region`.

    Meant to be similar to :py:func:`numpy.asarray` function: strings
    are converted with :py:meth:`Region.from_str`, anything else is
    handed back untouched.

    Parameters
    ----------
    region : str or Region
        Region already as a Region or as a str

    Returns
    -------
    Region
        Region representation.

    Examples
    --------
    >>> from tdub.data import as_region, Region
    >>> as_region("r2j1b")
    <Region.r2j1b: 1>
    >>> as_region(Region.r2j2b)
    <Region.r2j2b: 2>

    """
    return Region.from_str(region) if isinstance(region, str) else region


class SampleInfo:
    """Describes a sample's attributes given its name.

    Names containing ``"Data_Data"`` are treated as collision data and
    get fixed attribute values; every other name must match the
    ``<process>_<dsid>_<sim>_<campaign>_<tree>`` layout.

    Parameters
    ----------
    input_file : str
        File stem containing the necessary groups to parse.

    Attributes
    ----------
    phy_process : str
        Physics process (e.g. ttbar or tW_DR or Zjets)
    dsid : int
        Dataset ID
    sim_type : str
        Simulation type, "FS" or "AFII"
    campaign : str
        Campaign, MC16{a,d,e}
    tree : str
        Original tree (e.g. "nominal" or "EG_SCALE_ALL__1up")

    Raises
    ------
    ValueError
        If a non-data name does not match the expected layout.

    Examples
    --------
    >>> from tdub.data import SampleInfo
    >>> sampinfo = SampleInfo("ttbar_410472_AFII_MC16d_nominal.root")
    >>> sampinfo.phy_process
    'ttbar'
    >>> sampinfo.dsid
    410472
    >>> sampinfo.sim_type
    'AFII'
    >>> sampinfo.campaign
    'MC16d'
    >>> sampinfo.tree
    'nominal'

    """

    _parse: re.Pattern = re.compile(
        r"""(?P<phy_process>\w+)_
        (?P<dsid>[0-9]{6})_
        (?P<sim_type>(FS|AFII))_
        (?P<campaign>MC16(a|d|e))_
        (?P<tree>\w+)
        (\.\w+|$)""",
        re.X,
    )

    def __init__(self, input_file: str) -> None:
        """Class constructor."""
        # Data files carry no DSID/simulation/campaign tokens.
        if "Data_Data" in input_file:
            self.phy_process = "Data"
            self.dsid = 0
            self.sim_type = "Data"
            self.campaign = "Data"
            self.tree = "nominal"
            return

        parsed: Optional[re.Match] = SampleInfo._parse.match(input_file)
        if parsed is None:
            raise ValueError(f"{input_file} cannot be parsed by SampleInfo regex")
        tokens = parsed.groupdict()
        process = tokens["phy_process"]
        # Any process name beginning with "MCNP" is collapsed to "MCNP".
        self.phy_process = "MCNP" if process.startswith("MCNP") else process
        self.dsid = int(tokens["dsid"])
        self.sim_type = tokens["sim_type"]
        self.campaign = tokens["campaign"]
        self.tree = tokens["tree"]


def avoids_for(region: Union[str, Region]) -> List[str]:
    """Look up the features that should be avoided in a region.

    See the :py:mod:`tdub.config` module for definition of the
    variables to avoid (and how to modify them).

    Parameters
    ----------
    region : str or tdub.data.Region
        Region to get the associated avoided branches.

    Returns
    -------
    list(str)
        Features to avoid for the region.

    Raises
    ------
    ValueError
        If the region is not one of `1j1b`, `2j1b` or `2j2b`.

    Examples
    --------
    >>> from tdub.data import avoids_for, Region
    >>> avoids_for(Region.r2j1b)
    ['HT_jet1jet2', 'deltaR_lep1lep2_jet1jet2met', 'mass_lep2jet1', 'pT_jet2']
    >>> avoids_for("2j2b")
    ['deltaR_jet1_jet2']

    """
    region = as_region(region)
    # The config attribute is read only for the region that matches.
    config_names = (
        (Region.r1j1b, "AVOID_IN_CLF_1j1b"),
        (Region.r2j1b, "AVOID_IN_CLF_2j1b"),
        (Region.r2j2b, "AVOID_IN_CLF_2j2b"),
    )
    for candidate, attribute in config_names:
        if region == candidate:
            return getattr(tdub.config, attribute)
    raise ValueError(f"Incompatible region: {region}")


def branches_from(
    source: DataSource,
    tree: str = "WtLoop_nominal",
    ignore_weights: bool = False,
) -> List[str]:
    """Get a list of branches from a data source.

    If the `source` is a list (or tuple) of files, the first file is
    the only file that is parsed. A file opened by this function is
    closed again before returning; a directory or tree object passed
    in by the caller is left open.

    Parameters
    ----------
    source : str, list(str), os.PathLike, list(os.PathLike), or uproot File/Tree
        What to parse to get the branch information.
    tree : str
        Name of the tree to get branches from (unused when `source`
        is already a tree object).
    ignore_weights : bool
        Flag to ignore all branches starting with `weight_`.

    Returns
    -------
    list(str)
        Branches from the source, sorted case-insensitively.

    Raises
    ------
    TypeError
        If `source` can't be used to find a list of branches.
    IndexError
        If `source` is an empty list or tuple.

    Examples
    --------
    >>> from tdub.data import branches_from
    >>> branches_from("/path/to/file.root", ignore_weights=True)
    ["pT_lep1", "pT_lep2"]
    >>> branches_from("/path/to/file.root")
    ["pT_lep1", "pT_lep2", "weight_nominal", "weight_tptrw"]

    """
    if isinstance(source, (str, os.PathLike, list, tuple)):
        first = source[0] if isinstance(source, (list, tuple)) else source
        # We own this file handle, so release it once the names are read.
        with uproot.open(first) as opened:
            branches = opened.get(tree).keys()
    elif isinstance(source, uproot.reading.ReadOnlyDirectory):
        branches = source.get(tree).keys()
    elif isinstance(source, uproot.behaviors.TTree.TTree):
        branches = source.keys()
    else:
        raise TypeError(f"Cannot use source (it is type {type(source)})")

    if ignore_weights:
        # ``match`` anchors at the start, so only names that begin with
        # "weight_" followed by at least one word character are dropped.
        is_weight = re.compile(r"(weight_\w+)").match
        branches = [name for name in branches if not is_weight(name)]

    return sorted(branches, key=str.lower)


def categorize_branches(source: List[str]) -> Dict[str, List[str]]:
    """Split a list of branches into three named groups.

    The categories:

    - `kinematics`: everything that is neither a weight nor meta
      information (used for classifiers)
    - `weights`: for any branch that starts with ``weight_`` or ends
      with ``_weight``
    - `meta`: for meta information (final state information)

    Parameters
    ----------
    source : list(str)
        Complete list of branches to be categorized.

    Returns
    -------
    dict(str, list(str))
        Dictionary connecting categories to their associated list of
        branches; each list is sorted case-insensitively.

    Examples
    --------
    >>> from tdub.data import categorize_branches, branches_from
    >>> branches = ["pT_lep1", "pT_lep2", "weight_nominal", "weight_sys_jvt", "reg2j2b"]
    >>> cated = categorize_branches(branches)
    >>> cated["weights"]
    ['weight_nominal', 'weight_sys_jvt']
    >>> cated["meta"]
    ['reg2j2b']
    >>> cated["kinematics"]
    ['pT_lep1', 'pT_lep2']

    Using a ROOT file:

    >>> root_file = PosixPath("/path/to/file.root")
    >>> cated = categorize_branches(branches_from(root_file))

    """
    known_meta = {
        "reg1j1b",
        "reg2j1b",
        "reg2j2b",
        "reg1j0b",
        "reg2j0b",
        "isMC16a",
        "isMC16d",
        "isMC16e",
        "OS",
        "SS",
        "elmu",
        "elel",
        "mumu",
        "charge_lep1",
        "charge_lep2",
        "pdgId_lep1",
        "pdgId_lep2",
        "runNumber",
        "randomRunNumber",
        "eventNumber",
    }
    available = set(source)
    is_weight = re.compile(r"(^weight_\w+)|(\w+_weight$)").match
    weight_names = {name for name in available if is_weight(name)}
    meta_names = known_meta & available
    # Whatever is left after removing weights and meta is kinematic.
    kinematic_names = available - weight_names - meta_names

    def ordered(names):
        return sorted(names, key=str.lower)

    return {
        "kinematics": ordered(kinematic_names),
        "weights": ordered(weight_names),
        "meta": ordered(meta_names),
    }


def features_for(region: Union[str, Region]) -> List[str]:
    """Look up the list of features used in a region.

    See the :py:mod:`tdub.config` module for the definitions of the
    feature lists (and how to modify them).

    Parameters
    ----------
    region : str or tdub.data.Region
        Region as a string or enum entry. Using ``"ALL"`` returns a
        list of unique features from all regions.

    Returns
    -------
    list(str)
        Features for that region (or all regions).

    Raises
    ------
    ValueError
        If the region is not one of `1j1b`, `2j1b` or `2j2b`.

    Examples
    --------
    >>> from pprint import pprint
    >>> from tdub.data import features_for
    >>> pprint(features_for("reg2j1b"))
    ['mass_lep1jet1',
     'mass_lep1jet2',
     'mass_lep2jet1',
     'mass_lep2jet2',
     'pT_jet2',
     'pTsys_lep1lep2jet1jet2met',
     'psuedoContTagBin_jet1',
     'psuedoContTagBin_jet2']

    """
    # The special "ALL" request merges the three per-region lists into
    # one case-insensitively sorted list of unique names.
    if region == "ALL":
        combined = set(tdub.config.FEATURESET_1j1b)
        combined = combined | set(tdub.config.FEATURESET_2j1b)
        combined = combined | set(tdub.config.FEATURESET_2j2b)
        return sorted(combined, key=str.lower)

    region = as_region(region)
    # The config attribute is read only for the region that matches.
    config_names = (
        (Region.r1j1b, "FEATURESET_1j1b"),
        (Region.r2j1b, "FEATURESET_2j1b"),
        (Region.r2j2b, "FEATURESET_2j2b"),
    )
    for candidate, attribute in config_names:
        if region == candidate:
            return getattr(tdub.config, attribute)
    raise ValueError(f"Incompatible region: {region}")


def quick_files(
    datapath: Union[str, os.PathLike],
    campaign: Optional[str] = None,
    tree: str = "nominal",
) -> Dict[str, List[str]]:
    """Get a dictionary connecting sample processes to file lists.

    The lists of files are sorted alphabetically. These types of
    samples are currently tested:

    - `tW_DR` (410648, 410649 full sim)
    - `tW_DR_AFII` (410648, 410649 fast sim)
    - `tW_DR_PS` (411038, 411039 fast sim)
    - `tW_DR_inc` (410646, 410647 full sim)
    - `tW_DR_inc_AFII` (410646, 410647 fast sim)
    - `tW_DS` (410656, 410657 full sim)
    - `tW_DS_inc` (410654, 410655 full sim)
    - `ttbar` (410472 full sim)
    - `ttbar_AFII` (410472 fast sim)
    - `ttbar_PS` (410558 fast sim)
    - `ttbar_PS713` (411234 fast sim)
    - `ttbar_hdamp` (410482 fast sim)
    - `ttbar_inc` (410470 full sim)
    - `ttbar_inc_AFII` (410470 fast sim)
    - `Diboson`
    - `Zjets`
    - `MCNP`
    - `Data`

    Every key above is always present in the result; a process with
    no matching file maps to an empty list.

    Parameters
    ----------
    datapath : str or os.PathLike
        Path where all of the ROOT files live.
    campaign : str, optional
        Enforce a single campaign ("MC16a", "MC16d", or "MC16e"). For
        data this selects the matching years (15/16, 17 or 18).
    tree : str
        Upstream AnalysisTop ntuple tree. The value is interpolated
        into the file name regular expression as-is.

    Returns
    -------
    dict(str, list(str))
        The dictionary of processes and their associated files.

    Raises
    ------
    ValueError
        If `campaign` is given but is not a known campaign.

    Examples
    --------
    >>> from pprint import pprint
    >>> from tdub.data import quick_files
    >>> qf = quick_files("/path/to/some/files") ## has 410472 ttbar samples
    >>> pprint(qf["ttbar"])
    ['/path/to/some/files/ttbar_410472_FS_MC16a_nominal.root',
     '/path/to/some/files/ttbar_410472_FS_MC16d_nominal.root',
     '/path/to/some/files/ttbar_410472_FS_MC16e_nominal.root']
    >>> qf = quick_files("/path/to/some/files", campaign="MC16d")
    >>> pprint(qf["tW_DR"])
    ['/path/to/some/files/tW_DR_410648_FS_MC16d_nominal.root',
     '/path/to/some/files/tW_DR_410649_FS_MC16d_nominal.root']
    >>> qf = quick_files("/path/to/some/files", campaign="MC16a")
    >>> pprint(qf["Data"])
    ['/path/to/some/files/Data15_data15_Data_Data_nominal.root',
     '/path/to/some/files/Data16_data16_Data_Data_nominal.root']

    """
    # Data-taking years that go with each MC campaign.
    data_stems = {
        "MC16a": "Data1(5|6)_data1(5|6)",
        "MC16d": "Data17_data17",
        "MC16e": "Data18_data18",
    }
    if campaign is None:
        campaign_re = "MC16(a|d|e)"
        data_stem = "Data1(5|6|7|8)_data1(5|6|7|8)"
    else:
        if campaign not in ("MC16a", "MC16d", "MC16e"):
            raise ValueError(f"{campaign} must be either 'MC16a', 'MC16d', or 'MC16e'")
        # Restricting the regex to the requested campaign replaces a
        # second filtering pass over the matched files.
        campaign_re = campaign
        data_stem = data_stems[campaign]

    path = str(PosixPath(datapath).resolve())
    files = os.listdir(path)

    # "<process>_<dsid>_<sim type>" part of the file name for each key.
    mc_stems = {
        "tW_DR": "tW_DR_41064(8|9)_FS",
        "tW_DR_AFII": "tW_DR_41064(8|9)_AFII",
        "tW_DR_PS": "tW_DR_41103(8|9)_AFII",
        "tW_DR_inc": "tW_DR_41064(6|7)_FS",
        "tW_DR_inc_AFII": "tW_DR_41064(6|7)_AFII",
        "tW_DS": "tW_DS_41065(6|7)_FS",
        "tW_DS_inc": "tW_DS_41065(4|5)_FS",
        "ttbar": "ttbar_410472_FS",
        "ttbar_AFII": "ttbar_410472_AFII",
        "ttbar_PS": "ttbar_410558_AFII",
        "ttbar_PS713": "ttbar_411234_AFII",
        "ttbar_hdamp": "ttbar_410482_AFII",
        "ttbar_inc": "ttbar_410470_FS",
        "ttbar_inc_AFII": "ttbar_410470_AFII",
        "Diboson": "Diboson_[0-9]{6}_FS",
        "Zjets": "Zjets_[0-9]{6}_FS",
        "MCNP": "MCNP_[0-9]{6}_FS",
    }

    # The dot before "root" is escaped so that only a literal ".root"
    # extension is accepted.
    patterns = {
        key: rf"{stem}_{campaign_re}_{tree}\.root$" for key, stem in mc_stems.items()
    }
    patterns["Data"] = rf"{data_stem}_Data_Data_{tree}\.root$"

    file_lists = {}
    for key, pattern in patterns.items():
        # ``match`` anchors at the start of the name, ``$`` at the end.
        matches = re.compile(pattern).match
        file_lists[key] = [f"{path}/{entry}" for entry in sorted(filter(matches, files))]

    return file_lists


def selection_as_numexpr(selection: str) -> str:
    """Convert an arbitrary selection string to numexpr form.

    The input format is auto-detected by ``formulate``.

    Parameters
    ----------
    selection : str
        Selection string in ROOT or numexpr

    Returns
    -------
    str
        Selection in numexpr format.

    Examples
    --------
    >>> selection = "reg1j1b == true && OS == true && mass_lep1jet1 < 155"
    >>> from tdub.data import selection_as_numexpr
    >>> selection_as_numexpr(selection)
    '(reg1j1b == True) & (OS == True) & (mass_lep1jet1 < 155)'

    """
    expression = formulate.from_auto(selection)
    return expression.to_numexpr()


def selection_as_root(selection: str) -> str:
    """Convert an arbitrary selection string to ROOT form.

    The input format is auto-detected by ``formulate``.

    Parameters
    ----------
    selection : str
        The selection string in ROOT or numexpr

    Returns
    -------
    str
        The same selection in ROOT format.

    Examples
    --------
    >>> selection = "(reg1j1b == True) & (OS == True) & (mass_lep1jet1 < 155)"
    >>> from tdub.data import selection_as_root
    >>> selection_as_root(selection)
    '(reg1j1b == true) && (OS == true) && (mass_lep1jet1 < 155)'

    """
    expression = formulate.from_auto(selection)
    return expression.to_root()


def selection_branches(selection: str) -> Set[str]:
    """Collect the variables (branches) that a selection refers to.

    The input format is auto-detected by ``formulate``.

    Parameters
    ----------
    selection : str
        Selection string in ROOT or numexpr

    Returns
    -------
    set(str)
        Necessary branches/variables

    Examples
    --------
    >>> from tdub.data import selection_branches
    >>> selection = "(reg1j1b == True) & (OS == True) & (mass_lep1lep2 > 100)"
    >>> selection_branches(selection)
    {'OS', 'mass_lep1lep2', 'reg1j1b'}
    >>> selection = "reg2j1b == true && OS == true && (mass_lep1jet1 < 155)"
    >>> selection_branches(selection)
    {'OS', 'mass_lep1jet1', 'reg2j1b'}

    """
    expression = formulate.from_auto(selection)
    return expression.variables


def selection_for(region: Union[str, Region], additional: Optional[str] = None) -> str:
    """Build the selection string for a region.

    We have three regions with a default selection (`1j1b`, `2j1b`,
    and `2j2b`), these are the possible argument options (in str or
    Enum form).

    Parameters
    ----------
    region : str or Region
        Region to get the selection for
    additional : str, optional
        Additional selection (in ROOT or numexpr form). It is
        converted to numexpr form and joined to the region specific
        selection with ``&``.

    Returns
    -------
    str
        Selection string in numexpr format.

    Raises
    ------
    ValueError
        If the region is not one of `1j1b`, `2j1b` or `2j2b`.

    Examples
    --------
    >>> from tdub.data import Region, selection_for
    >>> selection_for(Region.r2j1b)
    '(reg2j1b == True) & (OS == True)'
    >>> selection_for("reg1j1b")
    '(reg1j1b == True) & (OS == True)'
    >>> selection_for("2j2b")
    '(reg2j2b == True) & (OS == True)'
    >>> selection_for("2j2b", additional="minimaxmbl < 155")
    '((reg2j2b == True) & (OS == True)) & (minimaxmbl < 155)'
    >>> selection_for("2j1b", additional="mass_lep1jetb < 155 && mass_lep2jetb < 155")
    '((reg2j1b == True) & (OS == True)) & ((mass_lep1jetb < 155) & (mass_lep2jetb < 155))'

    """
    region = as_region(region)
    defaults = (
        (Region.r1j1b, "(reg1j1b == True) & (OS == True)"),
        (Region.r2j1b, "(reg2j1b == True) & (OS == True)"),
        (Region.r2j2b, "(reg2j2b == True) & (OS == True)"),
    )
    for candidate, base_selection in defaults:
        if region == candidate:
            break
    else:
        # No default selection exists for this region.
        raise ValueError("Incompatible region used")

    if additional is None:
        return base_selection
    extra = selection_as_numexpr(additional)
    return f"({base_selection}) & ({extra})"