"""Module for selecting features."""
# stdlib
import copy
import gc
import logging
import os
import json
from pathlib import PosixPath
from typing import Optional, List, Dict, Any, Union, Tuple
# externals
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# tdub
from tdub.frames import iterative_selection, drop_cols
from tdub.data import quick_files, Region, selection_for
import tdub.config
log = logging.getLogger(__name__)
class FeatureSelector:
    """A class to steer the steps of feature selection.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe which contains signal and background events; it
        should also only contain features we wish to test for (it is
        expected to be "clean" from non-kinematic information, like
        metadata and weights).
    labels : numpy.ndarray
        array of labels compatible with the dataframe (``1`` for
        :math:`tW` and ``0`` for :math:`t\\bar{t}`).
    weights : numpy.ndarray
        the weights array compatible with the dataframe
    importance_type : str
        the importance type ("gain" or "split")
    corr_threshold : float
        the threshold for excluding features based on correlations
    name : str, optional
        give the selector a name

    Attributes
    ----------
    df : pandas.DataFrame
        the raw dataframe as fed to the class instance
    weights : numpy.ndarray
        the raw weights array compatible with the dataframe
    labels : numpy.ndarray
        the raw labels array compatible with the dataframe (we expect
        ``1`` for signal, :math:`tW`, and ``0`` for background,
        :math:`t\\bar{t}`).
    raw_features : list(str)
        the list of all features determined at initialization
    name : str, optional
        a name for the selector pipeline, required to save the result
    corr_threshold : float
        the threshold for excluding features based on correlations
    default_clf_opts : dict
        the default arguments we initialize classifiers with
    corr_matrix : pandas.DataFrame
        the raw correlation matrix for the features (requires calling
        the ``check_collinearity`` function)
    correlated : pandas.DataFrame
        a dataframe matching features that satisfy the correlation
        threshold (requires calling the ``check_collinearity`` function)
    importances : pandas.DataFrame
        the importances as determined by a vanilla GBDT (requires
        calling the ``check_importances`` function)
    candidates : list(str)
        list of candidate features (sorted by importance) as determined
        by calling ``check_candidates``
    iterative_remove_aucs : dict(str, float)
        a dictionary of the form ``{feature : auc}`` providing the AUC
        value for a BDT trained *without* the feature given in the
        key. The keys are built from the ``candidates`` list.
    iterative_add_aucs : numpy.ndarray
        an array of AUC values built by iteratively adding the next
        best feature in the candidates list (the first entry is
        calculated using only the top feature, the second entry uses
        the top 2 features, and so on).

    Examples
    --------
    >>> from tdub.features import FeatureSelector, prepare_from_parquet
    >>> df, labels, weights = prepare_from_parquet("/path/to/pq/output", "2j1b", "DR")
    >>> fs = FeatureSelector(df=df, labels=labels, weights=weights, corr_threshold=0.90)
    """

    def __init__(
        self,
        df: pd.DataFrame,
        labels: np.ndarray,
        weights: np.ndarray,
        importance_type: str = "gain",
        corr_threshold: float = 0.85,
        name: Optional[str] = None,
    ) -> None:
        # basic sanity checks on the inputs
        assert np.unique(labels).shape[0] == 2, "labels should have 2 unique values"
        assert labels.shape == weights.shape, "labels and weights must have identical shape"
        assert corr_threshold < 1.0, "corr_threshold must be less than 1.0"
        assert (
            df.shape[0] == weights.shape[0]
        ), "df and weights must have the same number of entries"
        # hidden behind read-only properties
        self._df = df
        self._weights = weights
        self._labels = labels
        self._raw_features = df.columns.to_list()
        # completely hidden; the bkg/sig count ratio is the conventional
        # scale_pos_weight value for a binary classifier
        self._nsig = self._labels[self._labels == 1].shape[0]
        self._nbkg = self._labels[self._labels == 0].shape[0]
        self._scale_pos_weight = self._nbkg / self._nsig
        # public attributes
        self.name = name
        self.corr_threshold = corr_threshold
        self.default_clf_opts = dict(
            boosting_type="gbdt",
            importance_type=importance_type,
            learning_rate=0.1,
            n_estimators=200,
            num_leaves=100,
            max_depth=5,
        )
        # calculated later by the check_* member functions; all of
        # these are hidden behind read-only properties
        self._corr_matrix = None
        self._correlated = None
        self._importances = None
        self._candidates = None
        self._iterative_remove_aucs = None
        self._iterative_add_aucs = None
        self._model_params = None

    @property
    def df(self) -> pd.DataFrame:
        return self._df

    @property
    def weights(self) -> np.ndarray:
        return self._weights

    @property
    def labels(self) -> np.ndarray:
        return self._labels

    @property
    def raw_features(self) -> List[str]:
        return self._raw_features

    @property
    def corr_matrix(self) -> pd.DataFrame:
        return self._corr_matrix

    @property
    def correlated(self) -> pd.DataFrame:
        return self._correlated

    @property
    def importances(self) -> pd.DataFrame:
        return self._importances

    @property
    def candidates(self) -> List[str]:
        return self._candidates

    @property
    def iterative_remove_aucs(self) -> Optional[Dict[str, float]]:
        # was documented in the class attributes but previously had no
        # property exposing it
        return self._iterative_remove_aucs

    @property
    def iterative_add_aucs(self) -> np.ndarray:
        return self._iterative_add_aucs

    @property
    def model_params(self) -> Dict[str, Any]:
        return self._model_params

    def check_for_uniques(self, and_drop: bool = True) -> None:
        """Check the dataframe for features that have a single unique value.

        Parameters
        ----------
        and_drop : bool
            If ``True``, drop any single-valued columns.

        Examples
        --------
        >>> fs = FeatureSelector(df=df, labels=labels, weights=weights, corr_threshold=0.90)
        >>> fs.check_for_uniques(and_drop=True)
        """
        log.info("starting check_for_uniques step")
        uqcounts = self.df.nunique()
        to_drop = [col for col in uqcounts.index if uqcounts[col] == 1]
        if not to_drop:
            log.info("we didn't find any features with single unique values")
        if to_drop and and_drop:
            for d in to_drop:
                log.info(f"dropping {d} because it's a feature with a single unique value")
            self._df.drop(columns=to_drop, inplace=True)

    def check_collinearity(self, threshold: Optional[float] = None) -> None:
        """Calculate the correlations of the features.

        Given a correlation threshold this will construct a list of
        features that should be dropped based on the correlation
        values. This also populates the ``corr_matrix`` and
        ``correlated`` properties. If the ``threshold`` argument is not
        None then the instance's ``corr_threshold`` property is updated.

        Parameters
        ----------
        threshold : float, optional
            Override the existing correlations threshold.

        Examples
        --------
        >>> fs.check_collinearity(threshold=0.85)
        >>> fs.corr_threshold
        0.85
        """
        log.info("starting check_collinearity step")
        if threshold is not None:
            self.corr_threshold = threshold
        log.info("calculating correlations")
        self._corr_matrix = self.df.corr()
        # keep only the strict upper triangle so each pair is tested once
        # (fix: np.bool is a removed NumPy alias; use the builtin)
        uptri_mask = np.triu(np.ones(self.corr_matrix.shape), k=1).astype(bool)
        uptri = self.corr_matrix.where(uptri_mask)
        log.info(f"testing correlations above threshold: {self.corr_threshold}")
        dropcols = [c for c in uptri.columns if any(uptri[c].abs() > self.corr_threshold)]
        log.info(f"found {len(dropcols)} features with correlations above threshold")
        # fix: DataFrame.append returns a new frame and its result was
        # previously discarded, leaving `correlated` permanently empty;
        # collect per-feature frames and concatenate once
        frames = []
        for col in dropcols:
            above_threshold = uptri[col].abs() > self.corr_threshold
            other_features = list(uptri.index[above_threshold])
            coeffs = list(uptri[col][above_threshold])
            this_col = [col] * len(other_features)
            frames.append(
                pd.DataFrame(dict(drop_this=this_col, because=other_features, coeff=coeffs))
            )
        if frames:
            self._correlated = pd.concat(frames, ignore_index=True)
        else:
            self._correlated = pd.DataFrame(columns=["drop_this", "because", "coeff"])
        log.info("correlations now calculated")

    def check_importances(
        self,
        extra_clf_opts: Optional[Dict[str, Any]] = None,
        extra_fit_opts: Optional[Dict[str, Any]] = None,
        n_fits: int = 5,
        test_size: float = 0.5,
    ) -> None:
        """Train vanilla GBDTs to calculate feature importance.

        Some default options are used for the
        :py:class:`lightgbm.LGBMClassifier` instance and fit (see
        implementation); you can provide extras via function arguments.

        Parameters
        ----------
        extra_clf_opts : dict
            extra arguments forwarded to :py:class:`lightgbm.LGBMClassifier`.
        extra_fit_opts : dict
            extra arguments forwarded to :py:func:`lightgbm.LGBMClassifier.fit`.
        n_fits : int
            number of models to fit to determine importances
        test_size : float
            forwarded to :py:func:`sklearn.model_selection.train_test_split`

        Examples
        --------
        >>> fs.check_importances(extra_fit_opts=dict(verbose=40, early_stopping_round=15))
        """
        import lightgbm as lgbm

        log.info("starting check_importances step")
        clf_opts = copy.deepcopy(self.default_clf_opts)
        if extra_clf_opts is not None:
            clf_opts.update(extra_clf_opts)
        log.info("Classifier is configured with parameters:")
        model = lgbm.LGBMClassifier(**clf_opts)
        self._model_params = copy.deepcopy(model.get_params())
        for k, v in model.get_params().items():
            if v is not None:
                log.info(f"{k:>20} | {v:<12}")
        del model
        importance_counter = np.zeros(len(self.raw_features))
        log.info("starting importance testing training iterations")
        for i in range(1, n_fits + 1):
            log.info(f"iteration {i}/{n_fits}")
            model = lgbm.LGBMClassifier(**clf_opts)
            # reseed each split so every fit sees a different partition
            train_df, test_df, train_y, test_y, train_w, test_w = train_test_split(
                self.df,
                self.labels,
                self.weights,
                shuffle=True,
                test_size=test_size,
                random_state=(i * tdub.config.RANDOM_STATE),
            )
            fit_opts = dict(
                eval_metric="auc",
                eval_set=[(test_df, test_y)],
                eval_sample_weight=[test_w],
                early_stopping_rounds=10,
                verbose=20,
            )
            if extra_fit_opts is not None:
                # fix: iterating the dict directly yields keys only and
                # cannot be unpacked into (k, v) pairs
                fit_opts.update(extra_fit_opts)
            model.fit(train_df, train_y, sample_weight=train_w, **fit_opts)
            importance_counter += model.feature_importances_
            # explicitly release the split frames before the next fit
            gc.enable()
            del train_df, test_df, train_y, test_y, train_w, test_w
            gc.collect()
        self._importances = pd.DataFrame(
            dict(feature=self.raw_features, importance=(importance_counter / n_fits))
        )
        self._importances.sort_values("importance", ascending=False, inplace=True)
        self._importances.reset_index(inplace=True)

    def check_candidates(self, n: int = 20) -> None:
        """Get the top uncorrelated features.

        This will parse the correlations and most important features
        and build a list of ordered important features. When a feature
        that should be dropped due to a collinear feature is found, we
        ensure that the more important member of the pair is included
        in the resulting list and drop the other member of the pair.
        This populates the ``candidates`` attribute for the class.

        Parameters
        ----------
        n : int
            the total number of features to retrieve

        Examples
        --------
        >>> fs.check_candidates(n=25)
        """
        log.info("starting check_candidates step")
        if self._correlated is None:
            log.error("correlations are not calculated; call check_collinearity()")
            return None
        if self._importances is None:
            log.error("feature importances are not calculated; call check_importances()")
            return None
        log.info(f"checking for top {n} candidates")
        drop_because_corr = self.correlated.drop_this.to_list()
        features_ordered = self.importances.feature.to_list()
        n_top, exclude = [], []
        raw_top_n = features_ordered[:n]
        for f in raw_top_n:
            if f not in drop_because_corr:
                n_top.append(f)
                continue
            log.info(f"{f} is in the top {n}; but correlations say drop it; closer look:")
            # fix: query the `correlated` frame (the attribute was
            # misspelled `correlations`), use its real column name
            # `drop_this`, and interpolate the feature name (the query
            # was previously the literal string "{f}")
            dropped_df = self.correlated.query(f"drop_this == '{f}'")
            for corr_feat in dropped_df.because.to_list():
                if corr_feat not in drop_because_corr:
                    log.info(f"{corr_feat} will be kept without swap")
                # keep the more important member of the correlated pair
                if features_ordered.index(f) < features_ordered.index(corr_feat):
                    log.info(f"{corr_feat} to be replaced with {f}")
                    n_top.append(f)
                    exclude.append(corr_feat)
        for f in exclude:
            if f in n_top:
                n_top.remove(f)
        # dict keys drop duplicates while preserving insertion order (3.7+)
        self._candidates = list(dict.fromkeys(n_top))

    def check_iterative_remove_aucs(
        self,
        max_features: Optional[int] = None,
        extra_clf_opts: Optional[Dict[str, Any]] = None,
        extra_fit_opts: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Calculate the AUCs iteratively removing one feature at a time.

        After calling the check_candidates function we have a good set
        of candidate features; this function will train vanilla BDTs,
        one at a time removing one of the candidate features. We rank
        the feature based on how impactful its removal is.

        Parameters
        ----------
        max_features : int
            the maximum number of features to allow to be checked;
            default is the length of the ``candidates`` list.
        extra_clf_opts : dict
            extra arguments forwarded to :py:class:`lightgbm.LGBMClassifier`.
        extra_fit_opts : dict
            extra arguments forwarded to :py:func:`lightgbm.LGBMClassifier.fit`.

        Examples
        --------
        >>> fs.check_iterative_remove_aucs(max_features=20)
        """
        import lightgbm as lgbm

        log.info("Starting check_iterative_remove_aucs step")
        if self._candidates is None:
            log.error("candidates are not calculated; call check_candidates()")
            return None
        if max_features is None:
            max_features = len(self.candidates)
        train_df, test_df, train_y, test_y, train_w, test_w = train_test_split(
            self.df[self.candidates],
            self.labels,
            self.weights,
            test_size=0.33,
            random_state=tdub.config.RANDOM_STATE,
            shuffle=True,
        )
        clf_opts = copy.deepcopy(self.default_clf_opts)
        if extra_clf_opts is not None:
            clf_opts.update(extra_clf_opts)
        log.info("Classifier is configured with parameters:")
        for k, v in clf_opts.items():
            log.info(f"{k:>20} | {v:<12}")
        self._iterative_remove_aucs = {}
        # fix: iterate only the first max_features candidates; the old
        # code looped over the full list while removing entries from a
        # slice, which raised ValueError past max_features
        pool = self.candidates[:max_features]
        for i, candidate in enumerate(pool, start=1):
            log.info(f"removing {candidate} and training a BDT")
            log.info(f"iteration {i}/{max_features}")
            reduced_features = [c for c in pool if c != candidate]
            assert len(reduced_features) == len(pool) - 1
            itrain_df = train_df[reduced_features]
            itest_df = test_df[reduced_features]
            model = lgbm.LGBMClassifier(**clf_opts)
            fit_opts = dict(
                eval_metric="auc",
                eval_set=[(itest_df, test_y)],
                eval_sample_weight=[test_w],
                early_stopping_rounds=15,
                verbose=20,
            )
            if extra_fit_opts is not None:
                # fix: was iterating dict keys and unpacking (k, v)
                fit_opts.update(extra_fit_opts)
            model.fit(itrain_df, train_y, sample_weight=train_w, **fit_opts)
            self._iterative_remove_aucs[candidate] = model.best_score_["valid_0"]["auc"]
            gc.enable()
            del itrain_df, itest_df
            gc.collect()
        # sort by value (AUC): the lower the AUC without a feature, the
        # more important that feature is
        self._iterative_remove_aucs = dict(
            sorted(self._iterative_remove_aucs.items(), key=lambda item: item[1])
        )
        for i, (k, v) in enumerate(self._iterative_remove_aucs.items()):
            log.info(f"without rank {i}, {k}, AUC is {v}")
        self._candidates = list(self._iterative_remove_aucs.keys())

    def check_iterative_add_aucs(
        self,
        max_features: Optional[int] = None,
        extra_clf_opts: Optional[Dict[str, Any]] = None,
        extra_fit_opts: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Calculate AUCs iteratively adding the next best feature.

        After calling the check_candidates function we have a good set
        of candidate features; this function will train vanilla BDTs,
        iteratively including one more feature at a time starting with
        the most important.

        Parameters
        ----------
        max_features : int
            the maximum number of features to allow to be checked;
            default is the length of the ``candidates`` list.
        extra_clf_opts : dict
            extra arguments forwarded to :py:class:`lightgbm.LGBMClassifier`.
        extra_fit_opts : dict
            extra arguments forwarded to :py:func:`lightgbm.LGBMClassifier.fit`.

        Examples
        --------
        >>> fs.check_iterative_add_aucs(max_features=20)
        """
        import lightgbm as lgbm

        log.info("starting check_iterative_add_aucs step")
        if self._candidates is None:
            log.error("candidates are not calculated; call check_candidates()")
            return None
        if max_features is None:
            max_features = len(self.candidates)
        train_df, test_df, train_y, test_y, train_w, test_w = train_test_split(
            self.df[self.candidates],
            self.labels,
            self.weights,
            test_size=0.33,
            random_state=tdub.config.RANDOM_STATE,
            shuffle=True,
        )
        clf_opts = copy.deepcopy(self.default_clf_opts)
        if extra_clf_opts is not None:
            clf_opts.update(extra_clf_opts)
        log.info("Classifier is configured with parameters:")
        for k, v in clf_opts.items():
            log.info(f"{k:>20} | {v:<12}")
        self._iterative_add_aucs = []
        for i in range(1, max_features + 1):
            log.info(f"iteration {i}/{max_features}")
            # use the top-i candidates for this round
            ifeatures = self.candidates[:i]
            itrain_df = train_df[ifeatures]
            itest_df = test_df[ifeatures]
            model = lgbm.LGBMClassifier(**clf_opts)
            fit_opts = dict(
                eval_metric="auc",
                eval_set=[(itest_df, test_y)],
                eval_sample_weight=[test_w],
                early_stopping_rounds=15,
                verbose=20,
            )
            if extra_fit_opts is not None:
                # fix: was iterating dict keys and unpacking (k, v)
                fit_opts.update(extra_fit_opts)
            model.fit(itrain_df, train_y, sample_weight=train_w, **fit_opts)
            self._iterative_add_aucs.append(model.best_score_["valid_0"]["auc"])
            gc.enable()
            del ifeatures, itrain_df, itest_df
            gc.collect()
        self._iterative_add_aucs = np.array(self._iterative_add_aucs)

    def save_result(self) -> None:
        """Save the results to a directory named from the ``name`` attribute.

        The output directory is ``fsel_result.{name}`` in the current
        working directory.

        Raises
        ------
        ValueError
            if the ``name`` attribute is ``None``

        Examples
        --------
        >>> fs.name = "2j1b_DR"
        >>> fs.save_result()
        """
        if self.name is None:
            raise ValueError("name attribute cannot be None to save result")
        outdir = PosixPath(f"fsel_result.{self.name}")
        try:
            outdir.mkdir(exist_ok=False)
        except FileExistsError:
            # fix: log.warn is a deprecated alias of log.warning
            log.warning(f"{outdir} already exists; contents will be overwritten")
        out_raw_aucs = outdir / "raw_aucs.txt"
        out_raw_tf = outdir / "raw_top_features.txt"
        out_params = outdir / "model_params.json"
        with out_raw_aucs.open("w") as f2w:
            np.savetxt(f2w, self.iterative_add_aucs, fmt="%.10f")
        with out_raw_tf.open("w") as f2w:
            f2w.write("\n".join(self.candidates))
            f2w.write("\n")
        with out_params.open("w") as f2w:
            json.dump(self.model_params, f2w, indent=4)
def create_parquet_files(
    qf_dir: Union[str, os.PathLike],
    out_dir: Optional[Union[str, os.PathLike]] = None,
    entrysteps: Optional[Any] = None,
    use_campaign_weight: bool = False,
) -> None:
    """Create slimmed and selected parquet files from ROOT files.

    This function requires pyarrow_.

    .. _pyarrow: https://arrow.apache.org/docs/python/

    Parameters
    ----------
    qf_dir : str or os.PathLike
        directory to run :py:func:`tdub.data.quick_files`
    out_dir : str or os.PathLike, optional
        directory to save output files
    entrysteps : any, optional
        entrysteps option forwarded to
        :py:func:`tdub.frames.iterative_selection`
    use_campaign_weight : bool
        multiply the nominal weight by the campaign weight. this is
        potentially necessary if the samples were prepared without the
        campaign weight included in the product which forms the nominal
        weight

    Examples
    --------
    >>> from tdub.features import create_parquet_files
    >>> create_parquet_files("/path/to/root/files", "/path/to/pq/output", entrysteps="250 MB")
    """
    source_dir = str(PosixPath(qf_dir).resolve())
    qf = quick_files(source_dir)
    # default destination is the current working directory; only an
    # explicitly requested directory gets created
    if out_dir is None:
        destination = PosixPath(".")
    else:
        destination = PosixPath(out_dir)
        destination.mkdir(exist_ok=True, parents=True)
    steps = "1 GB" if entrysteps is None else entrysteps
    for region in ("1j1b", "2j1b", "2j2b"):
        # these columns are never used for feature selection
        unwanted_cols = ["eta_met", "bdt_response"]
        if region == "1j1b":
            unwanted_cols.append("minimaxmbl")
        for sample in ("tW_DR", "tW_DS", "ttbar"):
            log.info(f"preparing to save a {sample} {region} parquet file using the files:")
            for fname in qf[sample]:
                log.info(f"  - {fname}")
            frame = iterative_selection(
                qf[sample],
                selection_for(region),
                keep_category="kinematics",
                concat=True,
                entrysteps=steps,
                use_campaign_weight=use_campaign_weight,
            )
            frame.drop_cols(*unwanted_cols)
            frame.drop_avoid(region=region)
            if region == "1j1b":
                frame.drop_jet2()
            outname = str(destination / f"{sample}_{region}.parquet")
            frame.to_parquet(outname, engine="pyarrow")
            log.info(f"{outname} saved")
def prepare_from_parquet(
    data_dir: Union[str, os.PathLike],
    region: Union[str, Region],
    nlo_method: str = "DR",
    ttbar_frac: Optional[Union[str, float]] = None,
    weight_mean: Optional[float] = None,
    weight_scale: Optional[float] = None,
    scale_sum_weights: bool = True,
    test_case_size: Optional[int] = None,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """Prepare feature selection data from parquet files.

    This function requires pyarrow_.

    .. _pyarrow: https://arrow.apache.org/docs/python/

    Parameters
    ----------
    data_dir : str or os.PathLike
        directory where the parquet files live
    region : str or tdub.data.Region
        the region where we're going to select features
    nlo_method : str
        the :math:`tW` sample (``DR`` or ``DS``)
    ttbar_frac : str or float, optional
        if not ``None``, this is the fraction of :math:`t\\bar{t}`
        events to use; passing ``"auto"`` uses some sensible defaults
        to fit in memory: 0.70 for 2j2b, 0.60 for 2j1b and 1.00 for
        1j1b.
    weight_mean : float, optional
        scale all weights such that the mean weight is this
        value. Cannot be used with ``weight_scale``.
    weight_scale : float, optional
        value to scale all weights by, cannot be used with
        ``weight_mean``.
    scale_sum_weights : bool
        scale sum of weights of signal to be sum of weights of
        background
    test_case_size : int, optional
        if we want to perform a quick test, we use a subset of the
        data, for ``test_case_size=N`` we use ``N`` events from both
        signal and background. Cannot be used with ``ttbar_frac``.

    Returns
    -------
    pandas.DataFrame
        the dataframe which contains kinematic features
    numpy.ndarray
        the labels array for the events
    numpy.ndarray
        the weights array for the events

    Raises
    ------
    ValueError
        if mutually exclusive options are given together
    RuntimeError
        if ``data_dir`` does not exist

    Examples
    --------
    >>> from tdub.features import prepare_from_parquet
    >>> df, labels, weights = prepare_from_parquet("/path/to/pq/output", "2j1b", "DR")
    """
    if weight_scale is not None and weight_mean is not None:
        raise ValueError("weight_scale and weight_mean cannot be used together")
    if ttbar_frac is not None and test_case_size is not None:
        raise ValueError("ttbar_frac and test_case_size cannot be used together")
    data_path = PosixPath(data_dir)
    if not data_path.exists():
        raise RuntimeError(f"{data_dir} doesn't exist")
    sig_file = str(data_path / f"tW_{nlo_method}_{region}.parquet")
    bkg_file = str(data_path / f"ttbar_{region}.parquet")
    sig = pd.read_parquet(sig_file, engine="pyarrow")
    bkg = pd.read_parquet(bkg_file, engine="pyarrow")
    log.info(f"sig file loaded: {sig_file}")
    log.info(f"bkg file loaded: {bkg_file}")
    # drop weight-systematic columns from each frame; fix: the bkg
    # columns were previously dropped from the sig frame by mistake,
    # leaving them in the background dataframe
    sig_wsys_cols = [c for c in sig.columns.to_list() if "weight_sys" in c]
    bkg_wsys_cols = [c for c in bkg.columns.to_list() if "weight_sys" in c]
    drop_cols(sig, *sig_wsys_cols)
    drop_cols(bkg, *bkg_wsys_cols)
    # warn about any column mismatch between the two frames
    for c in sig.columns.to_list():
        if c not in bkg.columns.to_list():
            log.warning(f"{c} not in bkg")
    for c in bkg.columns.to_list():
        if c not in sig.columns.to_list():
            log.warning(f"{c} not in sig")
    if ttbar_frac is not None:
        if ttbar_frac == "auto":
            # sensible defaults chosen so the frames fit in memory
            if region == "2j2b":
                ttbar_frac = 0.70
            elif region == "2j1b":
                ttbar_frac = 0.60
            elif region == "1j1b":
                ttbar_frac = 1.00
        if ttbar_frac < 1:
            log.info(f"sampling a fraction ({ttbar_frac}) of the background")
            bkg = bkg.sample(frac=ttbar_frac, random_state=tdub.config.RANDOM_STATE)
    if test_case_size is not None:
        if test_case_size > 5000:
            log.warning("why bother with test_case_size > 5000?")
        sig = sig.sample(n=test_case_size, random_state=tdub.config.RANDOM_STATE)
        bkg = bkg.sample(n=test_case_size, random_state=tdub.config.RANDOM_STATE)
    sig_weights = sig.pop("weight_nominal").to_numpy()
    bkg_weights = bkg.pop("weight_nominal").to_numpy()
    # negative MC weights are zeroed for training purposes
    sig_weights[sig_weights < 0] = 0.0
    bkg_weights[bkg_weights < 0] = 0.0
    if scale_sum_weights:
        sig_weights *= bkg_weights.sum() / sig_weights.sum()
    if "weight_campaign" in sig:
        drop_cols(sig, "weight_campaign")
    if "weight_campaign" in bkg:
        drop_cols(bkg, "weight_campaign")
    # 1 for signal (tW), 0 for background (ttbar)
    sig_labels = np.ones_like(sig_weights)
    bkg_labels = np.zeros_like(bkg_weights)
    df = pd.concat([sig, bkg])
    labels = np.concatenate([sig_labels, bkg_labels])
    weights = np.concatenate([sig_weights, bkg_weights])
    # release the per-sample frames before returning the combined data
    gc.enable()
    del sig, bkg, sig_labels, bkg_labels, sig_weights, bkg_weights
    gc.collect()
    if weight_scale is not None:
        weights *= weight_scale
    if weight_mean is not None:
        weights *= weight_mean * len(weights) / np.sum(weights)
    return df, labels, weights