@@ -56,7 +56,7 @@ def from_records(
5656 """
5757 Initialize a SemHash instance from records.
5858
59- This removes exact duplicates, featurizes the records, and fits a vicinity index.
59+ Removes exact duplicates, featurizes the records, and fits a vicinity index.
6060
6161 :param records: A list of records (strings or dictionaries).
6262 :param columns: Columns to featurize if records are dictionaries.
@@ -93,15 +93,8 @@ def from_dataset(
9393 """
9494 Initialize SemHash from a dataset (e.g., HuggingFace Dataset).
9595
96- Extracts records from the dataset, deduplicates them, and embeds only
97- representative records (not duplicates). The encoder controls batching internally.
98-
99- Supports any dataset-like object that provides:
100- - column_names: Sequence[str]
101- - __len__() -> int
102- - __getitem__(column_name: str) -> Sequence[Any] (columnar access)
103-
104- HuggingFace datasets.Dataset satisfies this contract, but custom implementations work too.
96+ Removes exact duplicates, featurizes the records, and fits a vicinity index.
97+ Supports any dataset-like object that follows the DatasetLike protocol.
10598
10699 :param dataset: A dataset-like object with columnar access.
107100 :param columns: Columns to use for deduplication (same as from_records).
@@ -117,7 +110,7 @@ def from_dataset(
117110 # Extract, validate, and deduplicate dataset records
118111 deduplicated_records , items , was_string = prepare_dataset_records (dataset , columns )
119112
120- # Embed representatives only (encoder decides batching internally)
113+ # Create embeddings for deduplicated records only
121114 vectors = featurize (records = deduplicated_records , columns = columns , model = model )
122115
123116 index = Index .from_vectors_and_items (vectors = vectors , items = items , backend_type = ann_backend , ** kwargs )
@@ -136,7 +129,7 @@ def from_embeddings(
136129 """
137130 Initialize a SemHash instance from pre-computed embeddings.
138131
139- This removes exact duplicates and fits a vicinity index using the provided embeddings.
132+ Removes exact duplicates and fits a vicinity index using the provided embeddings.
140133
141134 :param embeddings: Pre-computed embeddings as a numpy array of shape (n_records, embedding_dim).
142135 :param records: A list of records (strings or dictionaries) corresponding to the embeddings.
0 commit comments