Data Loader

evals_hub.data_loader.retrieval

`BioASQLoader`

Loader for the BioASQ 12b dataset, following the style of TextEmbedLoader. Returns queries, documents, and relevances as HuggingFace Datasets.

Source code in evals_hub/data_loader/retrieval/bioasq.py

class BioASQLoader:
    """
    Retrieval loader for the BioASQ 12b dataset, in the style of TextEmbedLoader.

    ``load_data`` produces three HuggingFace Datasets: queries, documents and
    relevances. Documents and relevances are both derived from the snippets
    attached to each question.
    """

    metadata = {
        "path": "datasets/BioASQ_12b",
        "split": "test",
    }

    def __init__(self, split: str | None = None):
        """
        Resolve the dataset path and the split to load.

        Args:
            split: Split to load. Falls back to ``metadata["split"]`` when it is
                omitted (or otherwise falsy).
        """
        hub_org = load_huggingface_config().get("org_name", "")
        dataset_name = self.metadata["path"]
        # Prefix the dataset with the configured organisation, when there is one.
        self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"

        self.split = split or self.metadata["split"]

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """
        Load the configured split.

        Returns:
            A ``(queries, documents, relevances)`` tuple of HuggingFace Datasets.
        """
        raw = load_dataset(self.path, split=self.split)
        queries = self._load_queries(raw)

        # The snippet texts are parsed once; both the documents and the
        # relevances are generated from the resulting table.
        snippet_table = self._get_all_text_df(raw)
        return (
            queries,
            self._load_documents(snippet_table),
            self._load_relevances(snippet_table),
        )

    def _load_queries(self, data: Dataset) -> Dataset:
        """Keep only ``id`` and ``body``, exposed as ``_id`` and ``query``."""
        return data.select_columns(["id", "body"]).rename_columns(
            {"id": "_id", "body": "query"}
        )

    def _extract_snippet_texts(self, item: dict[str, Any]) -> dict[str, Any]:
        """
        Turn the snippets of one question into ``(query_id, doc_id, text)`` tuples.

        The ``doc_id`` is ``<document id>_<offsetInBeginSection>_<offsetInEndSection>``,
        where the document id is the last path segment of the snippet's document
        URL, e.g. ``15858239`` for http://www.ncbi.nlm.nih.gov/pubmed/15858239.

        Returns:
            A dict with the single key ``"all_text"`` holding the list of tuples.
        """
        return {
            "all_text": [
                (
                    item["id"],
                    f"{snip['document'].rsplit('/', 1)[-1]}_{snip['offsetInBeginSection']}_{snip['offsetInEndSection']}",
                    snip["text"],
                )
                for snip in item["snippets"]
            ]
        }

    def _get_all_text_df(self, data: Dataset) -> pl.DataFrame:
        """
        Extract every snippet text into a polars DataFrame with the columns
        [query_id, doc_id, doc], one row per snippet.
        """
        mapped = data.map(self._extract_snippet_texts)
        # Flatten the per-question lists into one list of rows.
        rows = [row for per_question in mapped["all_text"] for row in per_question]
        return pl.DataFrame(rows, schema=["query_id", "doc_id", "doc"], orient="row")

    def _load_documents(self, all_text_df: pl.DataFrame) -> Dataset:
        """
        Build the deduplicated documents dataset with the columns [_id, doc].

        Deduplication runs twice, separately: first on ``doc``, then on ``_id``.
        The dataset contains minor errors, such as the same ID with different
        texts. Because ``_id`` is the document id plus offsetInBeginSection and
        offsetInEndSection, docA_0_12 may also carry the same text as docA_0_13.
        Order is maintained in both passes; otherwise the result would not be
        deterministic.
        """
        id_and_text = all_text_df.select(["doc_id", "doc"]).rename({"doc_id": "_id"})
        distinct_texts = id_and_text.unique(subset=["doc"], maintain_order=True)
        distinct_ids = distinct_texts.unique(subset=["_id"], maintain_order=True)
        return Dataset.from_polars(distinct_ids)

    def _load_relevances(self, all_text_df: pl.DataFrame) -> Dataset:
        """
        Build the relevances dataset with the columns [query_id, doc_id, score].

        Every snippet row is given a constant score of 1.
        """
        scored = all_text_df.with_columns(pl.lit(1).alias("score"))
        return Dataset.from_polars(scored.select(["query_id", "doc_id", "score"]))

`NFCorpusLoader`

Source code in evals_hub/data_loader/retrieval/nfcorpus.py

class NFCorpusLoader:
    # Relevance grades: direct links (2), indirect links (1); marginally relevant
    # and other pairs (0) are not in the files.
    metadata = {
        "path": "nfcorpus_gold",
        "split": "test",
    }

    def __init__(self, split: str | None = None, ssl_verify: bool = True):
        """
        Resolve the dataset path and split for NFCorpus.

        Args:
            split: Split used for the relevances. Falls back to
                ``metadata["split"]`` when omitted (or otherwise falsy).
            ssl_verify: When exactly ``False``, the HTTP backend is reconfigured
                through ``configure_http_backend`` with ``backend_factory``.
                NOTE(review): presumably a factory that skips certificate
                verification - confirm against its definition.
        """
        hub_org = load_huggingface_config().get("org_name", "")
        dataset_name = self.metadata["path"]
        # Prefix the dataset with the configured organisation, when there is one.
        self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"
        self.split = split or self.metadata["split"]
        if ssl_verify is False:
            configure_http_backend(backend_factory=backend_factory)

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """
        Load the NFCorpus dataset.

        Queries and documents are read from the configuration and split of the
        same name; only the relevances use the configured ``self.split``.

        Returns:
            A ``(queries, documents, relevances)`` tuple.
        """
        queries, documents = (
            load_dataset(self.path, part, split=part)
            for part in ("queries", "documents")
        )
        relevances = load_dataset(self.path, "relevances", split=self.split)
        return queries, documents, relevances

`__init__(split=None, ssl_verify=True)`

Initialize the NFCorpusLoader with metadata.

Source code in evals_hub/data_loader/retrieval/nfcorpus.py

def __init__(self, split: str | None = None, ssl_verify: bool = True):
    """
    Resolve the dataset path and split for NFCorpus.

    Args:
        split: Split used for the relevances. Falls back to
            ``metadata["split"]`` when omitted (or otherwise falsy).
        ssl_verify: When exactly ``False``, the HTTP backend is reconfigured
            through ``configure_http_backend`` with ``backend_factory``.
            NOTE(review): presumably a factory that skips certificate
            verification - confirm against its definition.
    """
    hub_org = load_huggingface_config().get("org_name", "")
    dataset_name = self.metadata["path"]
    # Prefix the dataset with the configured organisation, when there is one.
    self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"
    self.split = split or self.metadata["split"]
    if ssl_verify is False:
        configure_http_backend(backend_factory=backend_factory)

`load_data()`

Load the NFCorpus dataset.

Source code in evals_hub/data_loader/retrieval/nfcorpus.py

def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
    """
    Load the NFCorpus dataset.

    Queries and documents are read from the configuration and split of the
    same name; only the relevances use the configured ``self.split``.

    Returns:
        A ``(queries, documents, relevances)`` tuple.
    """
    queries, documents = (
        load_dataset(self.path, part, split=part)
        for part in ("queries", "documents")
    )
    relevances = load_dataset(self.path, "relevances", split=self.split)
    return queries, documents, relevances

evals_hub.data_loader.reranking

`AlloprofLoader`

Source code in evals_hub/data_loader/reranking/alloprof.py

class AlloprofLoader:
    """
    Loader for the Alloprof reranking dataset.

    Only the ``queries`` configuration of the dataset is loaded.
    """

    metadata = {
        "path": "fr-reranking-alloprof-s2p_gold",
        "split": "test",
    }

    def __init__(self, split: str | None = None, ssl_verify: bool = True):
        """
        Initialize the AlloprofLoader with metadata.

        Args:
            split: Split to load. Falls back to ``metadata["split"]`` when
                omitted (or otherwise falsy).
            ssl_verify: When exactly ``False``, the HTTP backend is reconfigured
                through ``configure_http_backend`` with ``backend_factory``.
                NOTE(review): presumably a factory that skips certificate
                verification - confirm against its definition.
        """
        config = load_huggingface_config()
        org_name = config.get("org_name", "")
        base_path = self.metadata["path"]
        # Prefix the dataset with the configured organisation, when there is one.
        self.path = f"{org_name}/{base_path}" if org_name else base_path
        self.split = split if split else self.metadata["split"]
        if ssl_verify is False:
            configure_http_backend(backend_factory=backend_factory)

    def load_data(self) -> Dataset:
        """
        Load the Alloprof dataset.

        Returns:
            The ``queries`` configuration for the configured split, as a single
            HuggingFace Dataset (not a tuple).
        """
        # Honour the split chosen at construction time instead of a hard-coded
        # "test"; the default split is still "test".
        queries = load_dataset(self.path, "queries", split=self.split)
        return queries

`__init__(split=None, ssl_verify=True)`

Initialize the AlloprofLoader with metadata.

Source code in evals_hub/data_loader/reranking/alloprof.py

def __init__(self, split: str | None = None, ssl_verify: bool = True):
    """
    Resolve the dataset path and split for Alloprof.

    Args:
        split: Split to load. Falls back to ``metadata["split"]`` when
            omitted (or otherwise falsy).
        ssl_verify: When exactly ``False``, the HTTP backend is reconfigured
            through ``configure_http_backend`` with ``backend_factory``.
            NOTE(review): presumably a factory that skips certificate
            verification - confirm against its definition.
    """
    hub_org = load_huggingface_config().get("org_name", "")
    dataset_name = self.metadata["path"]
    # Prefix the dataset with the configured organisation, when there is one.
    self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"
    self.split = split or self.metadata["split"]
    if ssl_verify is False:
        configure_http_backend(backend_factory=backend_factory)

`load_data()`

Load the Alloprof dataset.

Source code in evals_hub/data_loader/reranking/alloprof.py

def load_data(self) -> Dataset:
    """
    Load the Alloprof dataset.

    Returns:
        The ``queries`` configuration for the configured split, as a single
        HuggingFace Dataset (not a tuple).
    """
    # Honour the split chosen at construction time instead of a hard-coded
    # "test"; the default split is still "test".
    queries = load_dataset(self.path, "queries", split=self.split)
    return queries

evals_hub.data_loader.classification

`ClassificationLoader`

Bases: Protocol

Protocol for a classification loader. Requires the fields query and label.

Source code in evals_hub/data_loader/classification/protocol.py

class ClassificationLoader(Protocol):
    """
    Structural interface for classification loaders.

    Implementations must provide the fields query and label.
    """

    def load_data(self) -> tuple[Dataset, Dataset]:
        """Return two datasets: the queries and their labels."""

`AmazonCounterFactualLoader`

A loader for the Amazon Counterfactual dataset - label: int — 0 (not-counterfactual) or 1 (counterfactual)

Source code in evals_hub/data_loader/classification/amazon_counterfactual.py

class AmazonCounterFactualLoader:
    """
    Loader for the Amazon Counterfactual classification dataset.

    - label: int - 0 (not-counterfactual) or 1 (counterfactual)
    """

    metadata = {
        "path": "amazon_counterfactual_gold",
        "split": "test",
    }

    def __init__(
        self,
        split: str | None = None,
    ):
        """
        Resolve the dataset path and the split to load.

        Args:
            split: Split to load. Falls back to ``metadata["split"]`` when
                omitted (or otherwise falsy).
        """
        hub_org = load_huggingface_config().get("org_name", "")
        dataset_name = self.metadata["path"]
        # Prefix the dataset with the configured organisation, when there is one.
        self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"
        self.split = split or self.metadata["split"]

    def load_data(self) -> tuple[Dataset, Dataset]:
        """
        Load the AmazonCounterFactual dataset.

        Returns:
            A ``(queries, labels)`` tuple: the ``query`` column and the
            ``label`` column of the configured split, each as its own Dataset.
        """
        split_data = load_dataset(self.path, split=self.split)
        return split_data.select_columns("query"), split_data.select_columns("label")

`load_data()`

Load the AmazonCounterFactual dataset.

Source code in evals_hub/data_loader/classification/amazon_counterfactual.py

def load_data(self) -> tuple[Dataset, Dataset]:
    """
    Load the AmazonCounterFactual dataset.

    Returns:
        A ``(queries, labels)`` tuple: the ``query`` column and the
        ``label`` column of the configured split, each as its own Dataset.
    """
    split_data = load_dataset(self.path, split=self.split)
    return split_data.select_columns("query"), split_data.select_columns("label")

evals_hub.data_loader.nli

`NLILoader`

Bases: Protocol

Protocol for an NLI dataset loader. Requires the fields hypothesis, premise, and label

Source code in evals_hub/data_loader/nli/protocol.py

class NLILoader(Protocol):
    """
    Protocol for an NLI dataset loader. Requires
    the fields hypothesis, premise, and label
    """

    # Three datasets are returned, one per required field, so the annotation
    # is a 3-tuple.
    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """Return the premises, hypotheses and labels datasets."""

    def _load_hypotheses(self) -> Dataset:
        """Return the dataset holding the hypothesis field."""

    def _load_premises(self) -> Dataset:
        """Return the dataset holding the premise field."""

    def _load_labels(self) -> Dataset:
        """Return the dataset holding the label field."""

`SciFactNLILoader`

Load SciFact entailment dataset. Returns hypotheses, premises and labels as HuggingFace datasets.

Source code in evals_hub/data_loader/nli/scifact.py

class SciFactNLILoader:
    """
    Load SciFact entailment dataset. Returns premises, hypotheses and labels
    as HuggingFace datasets.

    The final labels are binary: 0 for CONTRADICT and 1 for SUPPORT. NEI rows
    are dropped on every split except ``train``, where they are kept and
    treated as CONTRADICT.
    """

    metadata = {
        "path": "allenai/scifact_entailment",
        "split": "validation",
        "hf_subset": "en",
    }
    LABELS = {"CONTRADICT": 0, "SUPPORT": 1, "NEI": 2}

    def __init__(
        self,
        split: str | None = None,
        seed: int | None = None,
        hf_subset: str | None = None,
    ):
        """
        Args:
            split (str): The split to evaluate
            seed (int): A seed to use for reproducibility. Defaults to 42 when
                not given; an explicit 0 is honoured.
            hf_subset (str): The language to evaluate
        """
        self.path = self.metadata["path"]
        self.split = split if split else self.metadata["split"]
        self.hf_subset = hf_subset if hf_subset else self.metadata["hf_subset"]

        # Set seed for reproducibility. Compare against None so that a
        # legitimate seed of 0 is not replaced by the default.
        self._seed = 42 if seed is None else seed

    @staticmethod
    def _generate_evidence_references(data: pl.DataFrame, seed: int) -> pl.DataFrame:
        """
        Randomly sample abstract sentences for NEI cases for training, treating these cases as
        `CONTRADICT`, as seen in the original SciFact paper. Either 1 or 2 sentence
        indices are drawn for each NEI abstract with probability 0.5.

        Args:
            data (pl.DataFrame): the original AllenAI SciFact entailment dataset, as a Polars DataFrame
            seed (int): A seed used for reproducibility

        Returns:
            pl.DataFrame: ``data`` with the ``evidence`` column replaced by the
            sampled sentence indices on NEI rows and left unchanged elsewhere.
        """
        rng = np.random.default_rng(seed)
        # Upper bound is exclusive, so each row draws either 1 or 2.
        n_nei_samples = rng.integers(1, 3, size=data.height)

        # Prepare the data for sampling for the NEI cases
        data_with_samples = (
            data.with_columns(n_nei_samples=n_nei_samples)
            .with_columns(
                pl.struct(["verdict", "abstract", "evidence", "n_nei_samples"]).alias(
                    "sample_config"
                )
            )
            .with_columns(
                pl.col("sample_config")
                .map_elements(
                    # In the case where verdict is NEI, sample 1 or 2 sentences at random
                    # from the abstract
                    lambda row: rng.integers(
                        0, len(row["abstract"]), size=row["n_nei_samples"]
                    )
                    if row["verdict"] == "NEI"
                    # Otherwise, keep the evidence as is
                    else row["evidence"],
                    return_dtype=pl.List(pl.Int64),
                )
                .alias("evidence")
            )
        )
        # The helper columns were only needed to drive the sampling.
        return data_with_samples.drop("sample_config", "n_nei_samples")

    @staticmethod
    def _flatten_premises(data: pl.DataFrame) -> pl.DataFrame:
        """
        Produce one row per evidence sentence.

        ``evidence`` is exploded so each row holds a single sentence index; the
        sentence at that index in ``abstract`` becomes ``premise`` and ``claim``
        is copied to ``hypothesis``. Out-of-bounds indices are not turned into
        nulls (``null_on_oob=False``).
        """
        return data.explode("evidence").with_columns(
            pl.col("abstract")
            .list.get(pl.col("evidence"), null_on_oob=False)
            .alias("premise"),
            pl.col("claim").alias("hypothesis"),
            "verdict",
        )

    def _load_data(self) -> Dataset:
        """
        Load the configured split and turn it into (premise, hypothesis, label) rows.

        Returns:
            Dataset: columns ``premise``, ``hypothesis`` and a binary ``label``
            (0 = CONTRADICT, 1 = SUPPORT).
        """
        scifact_dataset = (
            load_dataset(self.path, name=self.hf_subset, split=self.split)
            .with_format("polars")
            .map(
                self._generate_evidence_references,
                fn_kwargs={"seed": self._seed},
                batched=True,
            )
            .map(self._flatten_premises, batched=True)
            .with_format(None)
            .map(lambda row: {"label": self.LABELS[row["verdict"]]})
            .select_columns(["premise", "hypothesis", "label"])
        )

        nei_label = self.LABELS["NEI"]
        contradict_label = self.LABELS["CONTRADICT"]

        if self.split != "train":
            # Removing NEI samples when not in training set
            scifact_dataset = scifact_dataset.filter(
                lambda row: row["label"] != nei_label
            )

        # Remaining NEI rows (training only) are treated as CONTRADICT;
        # CONTRADICT and SUPPORT keep their own labels.
        scifact_dataset = scifact_dataset.map(
            lambda row: {
                "label": contradict_label
                if row["label"] == nei_label
                else row["label"]
            }
        )
        return scifact_dataset

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """
        Load the dataset once and split it into its three columns.

        Returns:
            A ``(premises, hypotheses, labels)`` tuple of single-column Datasets.
        """
        # Run the load/sample/flatten pipeline a single time rather than once
        # per column.
        data = self._load_data()
        premises = data.select_columns(["premise"])
        hypotheses = data.select_columns(["hypothesis"])
        labels = data.select_columns(["label"])
        return premises, hypotheses, labels

    def _load_premises(self) -> Dataset:
        """Return only the ``premise`` column (runs the full pipeline)."""
        premises = self._load_data().select_columns(["premise"])
        return premises

    def _load_hypotheses(self) -> Dataset:
        """Return only the ``hypothesis`` column (runs the full pipeline)."""
        hypotheses = self._load_data().select_columns(["hypothesis"])
        return hypotheses

    def _load_labels(self) -> Dataset:
        """Return only the ``label`` column (runs the full pipeline)."""
        labels = self._load_data().select_columns(["label"])
        return labels

`__init__(split=None, seed=None, hf_subset=None)`

Parameters:

Name	Type	Description	Default
`split`	`str`	The split to evaluate	`None`
`seed`	`int`	A seed to use for reproducibility	`None`
`hf_subset`	`str`	The language to evaluate	`None`

Source code in evals_hub/data_loader/nli/scifact.py

def __init__(
    self,
    split: str | None = None,
    seed: int | None = None,
    hf_subset: str | None = None,
):
    """
    Args:
        split (str): The split to evaluate
        seed (int): A seed to use for reproducibility. Defaults to 42 when
            not given; an explicit 0 is honoured.
        hf_subset (str): The language to evaluate
    """
    self.path = self.metadata["path"]
    self.split = split if split else self.metadata["split"]
    self.hf_subset = hf_subset if hf_subset else self.metadata["hf_subset"]

    # Set seed for reproducibility. Compare against None so that a
    # legitimate seed of 0 is not replaced by the default.
    self._seed = 42 if seed is None else seed

`XNLILoader`

Load the MTEB XNLI dataset. Returns hypotheses, premises and labels.

Source code in evals_hub/data_loader/nli/xnli.py

class XNLILoader:
    """
    Loader for the MTEB XNLI dataset. Returns premises, hypotheses and labels.
    """

    metadata = {
        "path": "xnli_gold",
        "split": "test",
        "hf_subset": "en",
    }

    def __init__(
        self,
        split: str | None = None,
        hf_subset: str | None = None,
    ):
        """
        Args:
            split (str, optional): The split of the dataset to use. Defaults to "test".
            hf_subset (str, optional): The HuggingFace subset to use. Defaults to "en".

        The dataset path is not an argument: it is built from
        ``metadata["path"]``, prefixed with the configured ``org_name`` when
        one is set.
        """
        hub_org = load_huggingface_config().get("org_name", "")
        dataset_name = self.metadata["path"]
        self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"
        self.split = split or self.metadata["split"]
        self.hf_subset = hf_subset or self.metadata["hf_subset"]

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """
        Load the configured subset and split.

        Returns:
            A ``(premises, hypotheses, labels)`` tuple of single-column Datasets.
        """
        data = load_dataset(self.path, name=self.hf_subset, split=self.split)

        premises, hypotheses, labels = (
            data.select_columns([column])
            for column in ("premise", "hypothesis", "label")
        )
        return premises, hypotheses, labels

`__init__(split=None, hf_subset=None)`

Parameters:

Name	Type	Description	Default
`path`	`str`	The path to the dataset	required
`split`	`str`	The split of the dataset to use. Defaults to "test".	`None`
`hf_subset`	`str`	The HuggingFace subset to use. Defaults to "en".	`None`

Source code in evals_hub/data_loader/nli/xnli.py

def __init__(
    self,
    split: str | None = None,
    hf_subset: str | None = None,
):
    """
    Args:
        split (str, optional): The split of the dataset to use. Defaults to "test".
        hf_subset (str, optional): The HuggingFace subset to use. Defaults to "en".

    The dataset path is not an argument: it is built from
    ``metadata["path"]``, prefixed with the configured ``org_name`` when
    one is set.
    """
    hub_org = load_huggingface_config().get("org_name", "")
    dataset_name = self.metadata["path"]
    self.path = dataset_name if not hub_org else f"{hub_org}/{dataset_name}"
    self.split = split or self.metadata["split"]
    self.hf_subset = hf_subset or self.metadata["hf_subset"]

Data Loader

BioASQLoader

NFCorpusLoader

__init__(split=None, ssl_verify=True)

load_data()

AlloprofLoader

__init__(split=None, ssl_verify=True)

load_data()

ClassificationLoader

AmazonCounterFactualLoader

load_data()

NLILoader

SciFactNLILoader

__init__(split=None, seed=None, hf_subset=None)

XNLILoader

__init__(split=None, hf_subset=None)

`BioASQLoader`

`NFCorpusLoader`

`__init__(split=None, ssl_verify=True)`

`load_data()`

`AlloprofLoader`

`__init__(split=None, ssl_verify=True)`

`load_data()`

`ClassificationLoader`

`AmazonCounterFactualLoader`

`load_data()`

`NLILoader`

`SciFactNLILoader`

`__init__(split=None, seed=None, hf_subset=None)`

`XNLILoader`

`__init__(split=None, hf_subset=None)`