Skip to content

Data Loader

evals_hub.data_loader.retrieval

BioASQLoader

Loader for the BioASQ 12b dataset, following the style of TextEmbedLoader. Returns queries, documents, and relevances as HuggingFace Datasets.

Source code in evals_hub/data_loader/retrieval/bioasq.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
class BioASQLoader:
    """
    Retrieval loader for the BioASQ 12b dataset, in the style of TextEmbedLoader.

    ``load_data`` returns queries, documents and relevances as HuggingFace
    Datasets.
    """

    metadata = {
        "path": "datasets/BioASQ_12b",
        "split": "test",
    }

    def __init__(self, split: str | None = None):
        """
        Resolve the dataset path and split.

        Args:
            split: Split to load; a falsy value falls back to
                ``metadata["split"]``.
        """
        hf_config = load_huggingface_config()
        organisation = hf_config.get("org_name", "")
        dataset_path = self.metadata["path"]
        # Prefix the path with the organisation only when one is configured.
        if organisation:
            self.path = f"{organisation}/{dataset_path}"
        else:
            self.path = dataset_path

        self.split = split or self.metadata["split"]

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """Load the configured split and return ``(queries, documents, relevances)``."""
        raw = load_dataset(self.path, split=self.split)
        queries = self._load_queries(raw)

        # Documents and relevances are both derived from the parsed snippets.
        snippet_table = self._get_all_text_df(raw)
        return (
            queries,
            self._load_documents(snippet_table),
            self._load_relevances(snippet_table),
        )

    def _load_queries(self, data: Dataset) -> Dataset:
        """Keep the ``id`` and ``body`` columns, renamed to ``_id`` and ``query``."""
        return data.select_columns(["id", "body"]).rename_columns(
            {"id": "_id", "body": "query"}
        )

    def _extract_snippet_texts(self, item: dict[str, Any]) -> dict[str, Any]:
        """
        Build one ``(query_id, doc_id, text)`` triple per snippet of ``item``.

        The doc_id is ``<document id>_<offsetInBeginSection>_<offsetInEndSection>``.
        """
        query_id = item["id"]
        triples = [
            (
                query_id,
                # The last URL segment is the document ID, e.g.
                # http://www.ncbi.nlm.nih.gov/pubmed/15858239 -> 15858239
                f"{snip['document'].split('/')[-1]}_{snip['offsetInBeginSection']}_{snip['offsetInEndSection']}",
                snip["text"],
            )
            for snip in item["snippets"]
        ]
        return {"all_text": triples}

    def _get_all_text_df(self, data: Dataset) -> pl.DataFrame:
        """
        Extract every snippet and return a polars DataFrame with columns
        [query_id, doc_id, doc].
        """
        mapped = data.map(self._extract_snippet_texts)
        # Flatten the per-query lists of triples into one list of rows.
        rows = [triple for per_query in mapped["all_text"] for triple in per_query]
        return pl.DataFrame(
            rows, schema=["query_id", "doc_id", "doc"], orient="row"
        )

    def _load_documents(self, all_text_df: pl.DataFrame) -> Dataset:
        """Return the documents, de-duplicated first by text and then by ``_id``."""
        # Maintaining row order is important; otherwise the result is not
        # deterministic.
        # unique() runs twice, separately for doc and _id, because the dataset
        # contains minor errors: the same ID can appear with different docs.
        # The _id is doc_id + offsetInBeginSection + offsetInEndSection, so for
        # example docA_0_12 may hold the same doc as docA_0_13.
        id_and_text = all_text_df.select(["doc_id", "doc"]).rename({"doc_id": "_id"})
        distinct_text = id_and_text.unique(subset=["doc"], maintain_order=True)
        distinct_ids = distinct_text.unique(subset=["_id"], maintain_order=True)
        return Dataset.from_polars(distinct_ids)

    def _load_relevances(self, all_text_df: pl.DataFrame) -> Dataset:
        """Return ``(query_id, doc_id, score)`` rows with a constant score of 1."""
        # NOTE(review): this uses the un-deduplicated table, so it may reference
        # doc_ids that _load_documents dropped - confirm this is intended.
        pairs = all_text_df.select(["query_id", "doc_id"])
        return Dataset.from_polars(pairs.with_columns(pl.lit(1).alias("score")))

NFCorpusLoader

Source code in evals_hub/data_loader/retrieval/nfcorpus.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class NFCorpusLoader:
    """Loader for the NFCorpus retrieval dataset."""

    # direct links (2), indirect links (1), marginally relevant and others (0, not in the files).
    metadata = {
        "path": "nfcorpus_gold",
        "split": "test",
    }

    def __init__(self, split: str | None = None, ssl_verify: bool = True):
        """
        Resolve the dataset path and split from the class metadata.

        Args:
            split: Split used for the relevances; a falsy value falls back to
                ``metadata["split"]``.
            ssl_verify: When exactly ``False``, ``configure_http_backend`` is
                called with ``backend_factory``.
        """
        hf_config = load_huggingface_config()
        organisation = hf_config.get("org_name", "")
        dataset_path = self.metadata["path"]
        # Prefix the path with the organisation only when one is configured.
        if organisation:
            self.path = f"{organisation}/{dataset_path}"
        else:
            self.path = dataset_path
        self.split = split or self.metadata["split"]
        if ssl_verify is False:
            configure_http_backend(backend_factory=backend_factory)

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """
        Load NFCorpus and return ``(queries, documents, relevances)``.

        Queries and documents come from fixed splits; only the relevances
        follow ``self.split``.
        """
        return (
            load_dataset(self.path, "queries", split="queries"),
            load_dataset(self.path, "documents", split="documents"),
            load_dataset(self.path, "relevances", split=self.split),
        )

__init__(split=None, ssl_verify=True)

Initialize the NFCorpusLoader with metadata.

Source code in evals_hub/data_loader/retrieval/nfcorpus.py
13
14
15
16
17
18
19
20
21
22
23
def __init__(self, split: str | None = None, ssl_verify: bool = True):
    """
    Resolve the NFCorpus dataset path and split from the class metadata.

    Args:
        split: Split to use; a falsy value falls back to ``metadata["split"]``.
        ssl_verify: When exactly ``False``, ``configure_http_backend`` is
            called with ``backend_factory``.
    """
    hf_config = load_huggingface_config()
    organisation = hf_config.get("org_name", "")
    dataset_path = self.metadata["path"]
    # Prefix the path with the organisation only when one is configured.
    if organisation:
        self.path = f"{organisation}/{dataset_path}"
    else:
        self.path = dataset_path
    self.split = split or self.metadata["split"]
    if ssl_verify is False:
        configure_http_backend(backend_factory=backend_factory)

load_data()

Load the NFCorpus dataset.

Source code in evals_hub/data_loader/retrieval/nfcorpus.py
25
26
27
28
29
30
31
32
def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
    """
    Load NFCorpus and return ``(queries, documents, relevances)``.

    Queries and documents come from fixed splits; only the relevances
    follow ``self.split``.
    """
    return (
        load_dataset(self.path, "queries", split="queries"),
        load_dataset(self.path, "documents", split="documents"),
        load_dataset(self.path, "relevances", split=self.split),
    )

evals_hub.data_loader.reranking

AlloprofLoader

Source code in evals_hub/data_loader/reranking/alloprof.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
class AlloprofLoader:
    """Loader for the Alloprof reranking dataset."""

    metadata = {
        "path": "fr-reranking-alloprof-s2p_gold",
        "split": "test",
    }

    def __init__(self, split: str | None = None, ssl_verify: bool = True):
        """
        Initialize the AlloprofLoader with metadata.

        Args:
            split: Split to load; a falsy value falls back to
                ``metadata["split"]``.
            ssl_verify: When exactly ``False``, ``configure_http_backend`` is
                called with ``backend_factory``.
        """
        config = load_huggingface_config()
        org_name = config.get("org_name", "")
        base_path = self.metadata["path"]
        # Prefix the path with the organisation only when one is configured.
        self.path = f"{org_name}/{base_path}" if org_name else base_path
        self.split = split if split else self.metadata["split"]
        if ssl_verify is False:
            configure_http_backend(backend_factory=backend_factory)

    def load_data(self) -> Dataset:
        """
        Load the ``queries`` configuration of the Alloprof dataset.

        Returns:
            The Dataset for ``self.split`` (a single Dataset, not a tuple).
        """
        # Honour the split chosen at construction time; it defaults to "test",
        # so default behaviour is unchanged.
        queries = load_dataset(self.path, "queries", split=self.split)
        return queries

__init__(split=None, ssl_verify=True)

Initialize the AlloprofLoader with metadata.

Source code in evals_hub/data_loader/reranking/alloprof.py
12
13
14
15
16
17
18
19
20
21
22
def __init__(self, split: str | None = None, ssl_verify: bool = True):
    """
    Resolve the Alloprof dataset path and split from the class metadata.

    Args:
        split: Split to use; a falsy value falls back to ``metadata["split"]``.
        ssl_verify: When exactly ``False``, ``configure_http_backend`` is
            called with ``backend_factory``.
    """
    hf_config = load_huggingface_config()
    organisation = hf_config.get("org_name", "")
    dataset_path = self.metadata["path"]
    # Prefix the path with the organisation only when one is configured.
    if organisation:
        self.path = f"{organisation}/{dataset_path}"
    else:
        self.path = dataset_path
    self.split = split or self.metadata["split"]
    if ssl_verify is False:
        configure_http_backend(backend_factory=backend_factory)

load_data()

Load the Alloprof dataset.

Source code in evals_hub/data_loader/reranking/alloprof.py
24
25
26
27
28
29
def load_data(self) -> Dataset:
    """
    Load the ``queries`` configuration of the Alloprof dataset.

    Returns:
        The Dataset for ``self.split`` (a single Dataset, not a tuple).
    """
    # Honour the split chosen at construction time; it defaults to "test",
    # so default behaviour is unchanged.
    queries = load_dataset(self.path, "queries", split=self.split)
    return queries

evals_hub.data_loader.classification

ClassificationLoader

Bases: Protocol

Protocol for a classification loader. Requires the fields query and label.

Source code in evals_hub/data_loader/classification/protocol.py
 6
 7
 8
 9
10
11
class ClassificationLoader(Protocol):
    """
    Structural interface for classification dataset loaders.

    Implementations must provide the fields query and label.
    """

    def load_data(self) -> tuple[Dataset, Dataset]:
        """Return two Datasets, one per required field."""
        ...

AmazonCounterFactualLoader

A loader for the Amazon Counterfactual dataset. Field `label` (int): 0 (not-counterfactual) or 1 (counterfactual).

Source code in evals_hub/data_loader/classification/amazon_counterfactual.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class AmazonCounterFactualLoader:
    """
    Loader for the Amazon Counterfactual classification dataset.
    - label: int — 0 (not-counterfactual) or 1 (counterfactual)
    """

    metadata = {
        "path": "amazon_counterfactual_gold",
        "split": "test",
    }

    def __init__(
        self,
        split: str | None = None,
    ):
        """
        Resolve the dataset path and split from the class metadata.

        Args:
            split: Split to load; a falsy value falls back to
                ``metadata["split"]``.
        """
        hf_config = load_huggingface_config()
        organisation = hf_config.get("org_name", "")
        dataset_path = self.metadata["path"]
        # Prefix the path with the organisation only when one is configured.
        if organisation:
            self.path = f"{organisation}/{dataset_path}"
        else:
            self.path = dataset_path
        self.split = split or self.metadata["split"]

    def load_data(self) -> tuple[Dataset, Dataset]:
        """
        Load the configured split and return ``(queries, labels)``.

        Each element is a single-column Dataset taken from the same rows.
        """
        rows = load_dataset(self.path, split=self.split)
        return rows.select_columns("query"), rows.select_columns("label")

load_data()

Load the AmazonCounterFactual dataset.

Source code in evals_hub/data_loader/classification/amazon_counterfactual.py
29
30
31
32
33
34
35
36
def load_data(self) -> tuple[Dataset, Dataset]:
    """
    Load the configured split and return ``(queries, labels)``.

    Each element is a single-column Dataset taken from the same rows.
    """
    rows = load_dataset(self.path, split=self.split)
    return rows.select_columns("query"), rows.select_columns("label")

evals_hub.data_loader.nli

NLILoader

Bases: Protocol

Protocol for an NLI dataset loader. Requires the fields hypothesis, premise, and label

Source code in evals_hub/data_loader/nli/protocol.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
class NLILoader(Protocol):
    """
    Protocol for an NLI dataset loader. Requires
    the fields hypothesis, premise, and label
    """

    # Three Datasets, one per required field. The previous two-element
    # annotation did not match the three fields this protocol requires.
    def load_data(self) -> tuple[Dataset, Dataset, Dataset]: ...

    def _load_hypotheses(self) -> Dataset: ...

    def _load_premises(self) -> Dataset: ...

    def _load_labels(self) -> Dataset: ...

SciFactNLILoader

Load SciFact entailment dataset. Returns hypotheses, premises and labels as HuggingFace datasets.

Source code in evals_hub/data_loader/nli/scifact.py
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
class SciFactNLILoader:
    """
    Load SciFact entailment dataset. Returns premises, hypotheses and labels
    as HuggingFace datasets.

    The final ``label`` column is binary: 1 for ``SUPPORT`` and 0 otherwise.
    """

    metadata = {
        "path": "allenai/scifact_entailment",
        "split": "validation",
        "hf_subset": "en",
    }
    # Intermediate integer codes for the dataset's ``verdict`` strings.
    LABELS = {"CONTRADICT": 0, "SUPPORT": 1, "NEI": 2}

    def __init__(
        self,
        split: str | None = None,
        seed: int | None = None,
        hf_subset: str | None = None,
    ):
        """
        Args:
            split (str): The split to evaluate
            seed (int): A seed to use for reproducibility. Defaults to 42
                when omitted.
            hf_subset (str): The language to evaluate
        """
        self.path = self.metadata["path"]
        self.split = split if split else self.metadata["split"]
        self.hf_subset = hf_subset if hf_subset else self.metadata["hf_subset"]

        # Set seed for reproducibility. Compare against None so that an
        # explicit seed of 0 is respected rather than replaced by the default.
        self._seed = 42 if seed is None else seed

    @staticmethod
    def _generate_evidence_references(data: pl.DataFrame, seed: int) -> pl.DataFrame:
        """
        Randomly sample abstract sentences for NEI cases for training, treating these cases as
        `CONTRADICT`, as seen in the original SciFact paper. Either 1 or 2 sentences are chosen from each NEI abstract with probability 0.5.

        Args:
            data (pl.DataFrame): the original AllenAI SciFact entailment dataset, as a Polars DataFrame
            seed (int): A seed used for reproducibility

        Returns:
            The input frame with ``evidence`` replaced by sampled sentence
            indices for NEI rows and left unchanged for all other rows.
        """
        rng = np.random.default_rng(seed)
        # One draw per row (1 or 2); only the NEI rows actually use theirs.
        n_nei_samples = rng.integers(1, 3, size=data.height)

        # Prepare the data for sampling for the NEI cases
        data_with_samples = (
            data.with_columns(n_nei_samples=n_nei_samples)
            .with_columns(
                pl.struct(["verdict", "abstract", "evidence", "n_nei_samples"]).alias(
                    "sample_config"
                )
            )
            .with_columns(
                pl.col("sample_config")
                .map_elements(
                    # In the case where verdict is NEI, sample 1 or 2 sentences at random
                    # from the abstract
                    lambda row: rng.integers(
                        0, len(row["abstract"]), size=row["n_nei_samples"]
                    )
                    if row["verdict"] == "NEI"
                    # Otherwise, keep the evidence as is
                    else row["evidence"],
                    return_dtype=pl.List(pl.Int64),
                )
                .alias("evidence")
            )
        )
        return data_with_samples.drop("sample_config", "n_nei_samples")

    @staticmethod
    def _flatten_premises(data: pl.DataFrame) -> pl.DataFrame:
        """
        Produce one row per evidence index.

        ``premise`` is the abstract sentence at that index and ``hypothesis``
        is a copy of ``claim``.
        """
        return data.explode("evidence").with_columns(
            pl.col("abstract")
            .list.get(pl.col("evidence"), null_on_oob=False)
            .alias("premise"),
            pl.col("claim").alias("hypothesis"),
            "verdict",
        )

    def _load_data(self) -> Dataset:
        """
        Load the split and build the ``premise``/``hypothesis``/``label`` table.

        NEI rows are dropped for every split except ``train``. The label is
        then collapsed to binary: 1 for SUPPORT, 0 for everything else.
        """
        nei_label = self.LABELS["NEI"]
        support_label = self.LABELS["SUPPORT"]

        scifact_dataset = (
            load_dataset(self.path, name=self.hf_subset, split=self.split)
            .with_format("polars")
            .map(
                self._generate_evidence_references,
                fn_kwargs={"seed": self._seed},
                batched=True,
            )
            .map(self._flatten_premises, batched=True)
            .with_format(None)
            .map(lambda row: {"label": self.LABELS[row["verdict"]]})
            .select_columns(["premise", "hypothesis", "label"])
        )

        if self.split != "train":
            # Removing NEI samples when not in training set. The previous
            # filter kept labels (0, 2), which dropped SUPPORT instead.
            scifact_dataset = scifact_dataset.filter(
                lambda row: row["label"] != nei_label
            )

        # Binary target: SUPPORT -> 1; CONTRADICT (and NEI, which is treated
        # as CONTRADICT for training) -> 0.
        scifact_dataset = scifact_dataset.map(
            lambda row: {"label": 1 if row["label"] == support_label else 0}
        )
        return scifact_dataset

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """Return ``(premises, hypotheses, labels)`` as single-column Datasets."""
        # Run the pipeline once so the three outputs come from the same rows.
        data = self._load_data()
        premises = data.select_columns(["premise"])
        hypotheses = data.select_columns(["hypothesis"])
        labels = data.select_columns(["label"])
        return premises, hypotheses, labels

    def _load_premises(self) -> Dataset:
        """Return only the ``premise`` column (re-runs the full pipeline)."""
        premises = self._load_data().select_columns(["premise"])
        return premises

    def _load_hypotheses(self) -> Dataset:
        """Return only the ``hypothesis`` column (re-runs the full pipeline)."""
        hypotheses = self._load_data().select_columns(["hypothesis"])
        return hypotheses

    def _load_labels(self) -> Dataset:
        """Return only the ``label`` column (re-runs the full pipeline)."""
        labels = self._load_data().select_columns(["label"])
        return labels

__init__(split=None, seed=None, hf_subset=None)

Parameters:

Name Type Description Default
split str

The split to evaluate

None
seed int

A seed to use for reproducibility

None
hf_subset str

The language to evaluate

None
Source code in evals_hub/data_loader/nli/scifact.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def __init__(
    self,
    split: str | None = None,
    seed: int | None = None,
    hf_subset: str | None = None,
):
    """
    Args:
        split (str): The split to evaluate
        seed (int): A seed to use for reproducibility
        hf_subset (str): The language to evaluate
    """
    self.path = self.metadata["path"]
    self.split = split if split else self.metadata["split"]
    self.hf_subset = hf_subset if hf_subset else self.metadata["hf_subset"]

    # Set seed for reproducibility
    self._seed = seed or 42

XNLILoader

Load the MTEB XNLI dataset. Returns hypotheses, premises and labels.

Source code in evals_hub/data_loader/nli/xnli.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class XNLILoader:
    """
    Loader for the MTEB XNLI dataset; yields premises, hypotheses and labels.
    """

    metadata = {
        "path": "xnli_gold",
        "split": "test",
        "hf_subset": "en",
    }

    def __init__(
        self,
        split: str | None = None,
        hf_subset: str | None = None,
    ):
        """
        Args:
            split (str, optional): The split of the dataset to use. Defaults to "test".
            hf_subset (str, optional): The HuggingFace subset to use. Defaults to "en".

        """
        hf_config = load_huggingface_config()
        organisation = hf_config.get("org_name", "")
        dataset_path = self.metadata["path"]
        # Prefix the path with the organisation only when one is configured.
        if organisation:
            self.path = f"{organisation}/{dataset_path}"
        else:
            self.path = dataset_path
        self.split = split or self.metadata["split"]
        self.hf_subset = hf_subset or self.metadata["hf_subset"]

    def load_data(self) -> tuple[Dataset, Dataset, Dataset]:
        """Load the subset/split and return ``(premises, hypotheses, labels)``."""
        rows = load_dataset(self.path, name=self.hf_subset, split=self.split)
        return (
            rows.select_columns(["premise"]),
            rows.select_columns(["hypothesis"]),
            rows.select_columns(["label"]),
        )

__init__(split=None, hf_subset=None)

Parameters:

Name Type Description Default
path str

The path to the dataset

required
split str

The split of the dataset to use. Defaults to "test".

None
hf_subset str

The HuggingFace subset to use. Defaults to "en".

None
Source code in evals_hub/data_loader/nli/xnli.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def __init__(
    self,
    split: str | None = None,
    hf_subset: str | None = None,
):
    """
    Args:
        split (str, optional): The split of the dataset to use. Defaults to "test".
        hf_subset (str, optional): The HuggingFace subset to use. Defaults to "en".

    """
    hf_config = load_huggingface_config()
    organisation = hf_config.get("org_name", "")
    dataset_path = self.metadata["path"]
    # Prefix the path with the organisation only when one is configured.
    if organisation:
        self.path = f"{organisation}/{dataset_path}"
    else:
        self.path = dataset_path
    self.split = split or self.metadata["split"]
    self.hf_subset = hf_subset or self.metadata["hf_subset"]