Skip to content

Commit 95cc5e6

Browse files
committed
Updated docstrings
1 parent 653dccc commit 95cc5e6

2 files changed

Lines changed: 6 additions & 18 deletions

File tree

semhash/semhash.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def from_records(
5656
"""
5757
Initialize a SemHash instance from records.
5858
59-
This removes exact duplicates, featurizes the records, and fits a vicinity index.
59+
Removes exact duplicates, featurizes the records, and fits a vicinity index.
6060
6161
:param records: A list of records (strings or dictionaries).
6262
:param columns: Columns to featurize if records are dictionaries.
@@ -93,15 +93,8 @@ def from_dataset(
9393
"""
9494
Initialize SemHash from a dataset (e.g., HuggingFace Dataset).
9595
96-
Extracts records from the dataset, deduplicates them, and embeds only
97-
representative records (not duplicates). The encoder controls batching internally.
98-
99-
Supports any dataset-like object that provides:
100-
- column_names: Sequence[str]
101-
- __len__() -> int
102-
- __getitem__(column_name: str) -> Sequence[Any] (columnar access)
103-
104-
HuggingFace datasets.Dataset satisfies this contract, but custom implementations work too.
96+
Removes exact duplicates, featurizes the records, and fits a vicinity index.
97+
Supports any dataset-like object that follows the DatasetLike protocol.
10598
10699
:param dataset: A dataset-like object with columnar access.
107100
:param columns: Columns to use for deduplication (same as from_records).
@@ -117,7 +110,7 @@ def from_dataset(
117110
# Extract, validate, and deduplicate dataset records
118111
deduplicated_records, items, was_string = prepare_dataset_records(dataset, columns)
119112

120-
# Embed representatives only (encoder decides batching internally)
113+
# Create embeddings for deduplicated records only
121114
vectors = featurize(records=deduplicated_records, columns=columns, model=model)
122115

123116
index = Index.from_vectors_and_items(vectors=vectors, items=items, backend_type=ann_backend, **kwargs)
@@ -136,7 +129,7 @@ def from_embeddings(
136129
"""
137130
Initialize a SemHash instance from pre-computed embeddings.
138131
139-
This removes exact duplicates and fits a vicinity index using the provided embeddings.
132+
Removes exact duplicates and fits a vicinity index using the provided embeddings.
140133
141134
:param embeddings: Pre-computed embeddings as a numpy array of shape (n_records, embedding_dim).
142135
:param records: A list of records (strings or dictionaries) corresponding to the embeddings.

semhash/utils.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def prepare_records(
280280

281281

282282
def _validate_dataset(dataset: DatasetLike, columns: Sequence[str]) -> tuple[dict[str, Sequence[Any]], int]:
283-
"""Validate dataset structure and extract columns. Returns (cols, n)."""
283+
"""Validate dataset structure and extract columns."""
284284
try:
285285
column_names = dataset.column_names
286286
except AttributeError as e:
@@ -309,11 +309,6 @@ def prepare_dataset_records(
309309
"""
310310
Extract, validate, and exact-deduplicate dataset rows using columnar access.
311311
312-
Supports HuggingFace datasets.Dataset and any dataset-like object that provides:
313-
- column_names: Sequence[str]
314-
- __len__() -> int
315-
- __getitem__(column_name: str) -> Sequence[Any] (columnar access)
316-
317312
:param dataset: A dataset-like object with columnar access.
318313
:param columns: Columns to use for deduplication.
319314
:return: Tuple of (deduplicated_records, items, was_string) where:

0 commit comments

Comments
 (0)