Source code for spine.io.read.larcv

"""Contains a reader class dedicated to loading data from LArCV files."""

import os
from typing import Any

import numpy as np

from spine.utils.conditional import LARCV_AVAILABLE, ROOT, ROOT_AVAILABLE, larcv
from spine.utils.logger import logger

from .base import ReaderBase

__all__ = ["LArCVReader"]



[docs]
class LArCVReader(ReaderBase):
    """Class which reads information stored in LArCV files.

    This class inherits from the :class:`ReaderBase` class. It provides
    methods to load LArCV2 files and extract their data products:

      - EventSparseTensor: voxel IDs and their values
      - EventClusterSparseTensor: list of sparse tensors
      - EventParticle: list of Geant4 particle information
      - EventNeutrino: list of generator neutrino information
      - EventFlash: list of optical flashes information
      - EventCRTHit: list of cosmic-ray tagger hits
      - EventTrigger: trigger information

    It builds a TChain from the list of files provided with the appropriate
    trees corresponding to each of the requested data products.
    """

    # Short identifier of this reader. NOTE(review): presumably the key used
    # to select this reader by name elsewhere in the package -- confirm
    # against the caller.
    name: str = "larcv"


[docs]
    def __init__(
        self,
        file_keys: str | list[str] | None = None,
        file_list: str | None = None,
        tree_keys: list[str] | None = None,
        limit_num_files: int | None = None,
        max_print_files: int = 10,
        n_entry: int | None = None,
        n_skip: int | None = None,
        entry_list: list[int] | None = None,
        skip_entry_list: list[int] | None = None,
        run_event_list: list[list[int]] | None = None,
        skip_run_event_list: list[list[int]] | None = None,
        create_run_map: bool = False,
        run_info_key: str | None = None,
        allow_missing: bool = False,
    ) -> None:
        """Initialize the LArCV file reader.

        Parameters
        ----------
        file_keys : Union[str, List[str]], optional
            Path or list of paths to the LArCV files to be read
        file_list : str, optional
            Path to a text file containing a list of file paths to be read
        tree_keys : List[str]
            List of data keys to load from the LArCV files. Although the
            argument defaults to `None`, at least one key must be provided.
        limit_num_files : Optional[int], optional
            Integer limiting number of files to be taken per data directory
        max_print_files : int, default 10
            Maximum number of loaded file names to be printed
        n_entry : Optional[int], optional
            Maximum number of entries to load
        n_skip : Optional[int], optional
            Number of entries to skip at the beginning
        entry_list : Optional[List[int]]
            List of integer entry IDs to add to the index
        skip_entry_list : Optional[List[int]]
            List of integer entry IDs to skip from the index
        run_event_list: Optional[List[List[int]]], optional
            List of (run, subrun, event) triplets to add to the index
        skip_run_event_list: Optional[List[List[int]]], optional
            List of (run, subrun, event) triplets to skip from the index
        create_run_map : bool, default False
            Initialize a map between (run, subrun, event) triplets and entries.
            For large files, this can be quite expensive (must load every entry).
            Forced to `True` when `run_event_list` or `skip_run_event_list`
            is provided.
        run_info_key : Optional[str], optional
            Key of the tree in the file to get the run information from
        allow_missing : bool, default False
            If `True`, allows missing entries in the entry or event list

        Raises
        ------
        ImportError
            If ROOT or larcv is not available
        ValueError
            If no `tree_keys` are provided, if the requested trees do not all
            hold the same number of entries, or if a run map is needed but
            `run_info_key` is missing or not part of `tree_keys`
        """
        # Check that ROOT and larcv are available
        if not ROOT_AVAILABLE:
            raise ImportError("ROOT is required to read LArCV files.")
        if not LARCV_AVAILABLE:
            raise ImportError("larcv is required to read LArCV files.")

        # Loading the larcv namespace registers the C++ dictionaries that ROOT
        # needs to cast branch objects.
        _ = larcv.__name__

        # Process the file_paths
        self.process_file_paths(file_keys, file_list, limit_num_files, max_print_files)
        if tree_keys is None or len(tree_keys) == 0:
            raise ValueError("No input `tree_keys` provided, abort.")

        # If an entry list is requested based on run/subrun/event ID, create map
        if run_event_list is not None or skip_run_event_list is not None:
            create_run_map = True

        # Prepare TTrees and load files
        self.num_entries = -1
        self.trees = {}
        self.trees_ready = False
        self.trees_pid = None
        self.file_offsets = np.empty(len(self.file_paths), dtype=np.int64)
        file_counts = []
        for key_idx, key in enumerate(tree_keys):
            # Check data TTree exists, and entries are identical across all
            # trees. Do not register these TTrees yet in order to support
            # > 1 workers by the DataLoader object downstream.
            logger.info("Loading tree %s", key)
            chain = ROOT.TChain(f"{key}_tree")  # pylint: disable=E1101
            for i, f in enumerate(self.file_paths):
                # Number of entries chained before file i is added, i.e. the
                # chain entry at which file i starts
                self.file_offsets[i] = chain.GetEntries()
                chain.AddFile(f)

                # Record per-file entry counts exactly once, on the first
                # pass. Gating on the loop position rather than on the key
                # value keeps `file_counts` aligned with `file_paths` even if
                # the first key is repeated in `tree_keys`.
                if key_idx == 0:
                    count = chain.GetEntries() - self.file_offsets[i]
                    file_counts.append(count)

            chain_entries = chain.GetEntries()
            if self.num_entries >= 0:
                if self.num_entries != chain_entries:
                    raise ValueError(
                        f"Mismatch between the number of entries for {key} "
                        f"({chain_entries}) and the number of entries "
                        f"in other data products ({self.num_entries})."
                    )
            else:
                self.num_entries = chain_entries

            # Placeholder only: the read-time chain is built lazily in `get`
            self.trees[key] = None
        logger.info("")

        # Dump the number of entries to load
        logger.info("Total number of entries in the file(s): %d\n", self.num_entries)

        # Build a file index (one file ID per chained entry)
        self.file_index = np.repeat(np.arange(len(self.file_paths)), file_counts)

        # If requested, must extract the run information for each entry
        if create_run_map:
            # Initialize the TChain object
            if run_info_key is None or run_info_key not in tree_keys:
                raise ValueError(
                    "Must provide the `run_info_key` if a run map is needed. "
                    "The key must appear in the list of `tree_keys`"
                )
            chain = ROOT.TChain(f"{run_info_key}_tree")  # pylint: disable=E1101
            for f in self.file_paths:
                chain.AddFile(f)

            # Loop over entries; every entry must be loaded to read its IDs
            self.run_info = []
            for i in range(self.num_entries):
                chain.GetEntry(i)
                info = getattr(chain, f"{run_info_key}_branch")
                self.run_info.append((info.run(), info.subrun(), info.event()))

        # Process the run information
        self.process_run_info()

        # Process the entry list
        self.process_entry_list(
            n_entry,
            n_skip,
            entry_list,
            skip_entry_list,
            run_event_list,
            skip_run_event_list,
            allow_missing,
        )



[docs]
    def get(self, idx: int) -> dict[str, Any]:
        """Load one entry of the dataset.

        Parameters
        ----------
        idx : int
            Position of the requested entry in the reader index

        Returns
        -------
        dict
            Dictionary holding the chain entry ID (`index`), the file ID
            (`file_index`), the entry ID within that file
            (`file_entry_index`), the source provenance fields and, for each
            data product key, the corresponding branch object

        Raises
        ------
        IndexError
            If `idx` falls outside of the `[0, len(self))` range
        """
        # Reject out-of-range requests up front
        if not 0 <= idx < len(self):
            raise IndexError(
                f"Index {idx} out of bounds for dataset of size {len(self)}."
            )

        # Translate the dataset position into chain/file coordinates
        chain_entry = self.entry_index[idx]
        file_idx = self.get_file_index(idx)
        file_entry_idx = self.get_file_entry_index(idx)

        # TChains and LArCV/PyROOT branch bindings are process-local. If the
        # chains were built under a different PID (e.g. the reader was
        # copied/forked into a DataLoader worker), drop them so that they get
        # rebuilt in the process which actually runs get().
        current_pid = os.getpid()
        built_elsewhere = self.trees_ready and self.trees_pid != current_pid
        if built_elsewhere:
            for key in self.trees:
                self.trees[key] = None
            self.trees_ready = False

        # Lazily build one chain per data product on first use
        if not self.trees_ready:
            # Touching the larcv namespace registers the C++ dictionaries
            # that ROOT needs to cast branch objects; this must happen in the
            # process which builds the read-time chains.
            _ = larcv.__name__
            for key in self.trees:
                product_chain = ROOT.TChain(f"{key}_tree")  # pylint: disable=E1101
                for path in self.file_paths:
                    product_chain.AddFile(path)
                self.trees[key] = product_chain
            self.trees_pid = current_pid
            self.trees_ready = True

        # Point every chain at the requested entry
        entry_number = int(chain_entry)
        for product_chain in self.trees.values():
            product_chain.GetEntry(entry_number)

        # Assemble the output: bookkeeping, provenance, then data products
        data = {
            "index": entry_number,
            "file_index": file_idx,
            "file_entry_index": file_entry_idx,
        }
        data.update(self.get_source_provenance(file_idx, file_entry_idx))
        data.update(
            {
                key: getattr(product_chain, f"{key}_branch")
                for key, product_chain in self.trees.items()
            }
        )

        return data



[docs]
    @staticmethod
    def list_data(file_path: str) -> dict[str, list[str]]:
        """Dumps top-level information about the contents of a LArCV root file.

        Only keys named `<type>_<...>_tree` are considered, where `<type>` is
        one of `sparse3d`, `cluster3d` or `particle`. Each match is reported
        with its trailing `_tree` suffix stripped.

        Parameters
        ----------
        file_path : str
            Path to the file to scan

        Returns
        -------
        dict
            Dictionary which maps data types onto a list of keys

        Raises
        ------
        ImportError
            If ROOT is not available
        OSError
            If the file cannot be opened
        """
        # Check that ROOT is available (consistent with the constructor)
        if not ROOT_AVAILABLE:
            raise ImportError("ROOT is required to read LArCV files.")

        # Load up the file; a failed open yields a null or zombie file object
        f = ROOT.TFile.Open(file_path, "r")  # pylint: disable=E1101
        if not f or f.IsZombie():
            raise OSError(f"Could not open LArCV file: {file_path}")

        # Loop over the list of keys, making sure the file is always closed
        data = {"sparse3d": [], "cluster3d": [], "particle": []}
        try:
            for k in f.GetListOfKeys():
                # Get the key name
                name = k.GetName()

                # Only look at tree names
                if not name.endswith("_tree"):
                    continue
                parts = name.split("_")
                if len(parts) < 3:
                    continue

                # Get the data type name, skip if not recognized
                key = parts[0]
                if key not in data:
                    continue

                # Append this specific tree name (without the `_tree` suffix)
                data[key].append(name[: name.rfind("_")])
        finally:
            f.Close()

        return data