Atlas Duplicate Clustering
Atlas groups your data into clusters of semantic duplicates using the latent information contained in your embeddings.
Under the hood, Atlas uses an algorithm similar to SemDeDup.
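The core idea is to cluster the embedding space coarsely, then flag points within each cluster that sit above a cosine-similarity threshold as duplicates of one another. The sketch below illustrates this SemDeDup-style procedure; it is not Atlas's implementation, and the k-means step, threshold value, and helper name are illustrative assumptions.

``` py
# Illustrative SemDeDup-style sketch, NOT Atlas's actual implementation.
# Assumptions: k-means for coarse clustering, cosine similarity, 0.95 threshold.
import numpy as np
from sklearn.cluster import KMeans

def semdedup_candidates(embeddings: np.ndarray, n_clusters: int = 100, threshold: float = 0.95):
    """Return indices of points that near-duplicate an earlier point in their cluster."""
    # Normalize rows so dot products equal cosine similarities
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    labels = KMeans(n_clusters=n_clusters, n_init='auto').fit_predict(normed)

    candidates = []
    for c in range(n_clusters):
        idx = np.where(labels == c)[0]
        sims = normed[idx] @ normed[idx].T  # pairwise similarity within one cluster
        for i in range(1, len(idx)):
            # Keep the first occurrence; flag later points that are too similar to it
            if np.any(sims[i, :i] > threshold):
                candidates.append(int(idx[i]))
    return candidates
```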
You can access and operate on semantic duplicate clusters programmatically through the `duplicates` attribute of an AtlasMap. Make sure duplicate clustering is enabled by setting `detect_duplicate=True` when building a map.
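For example, a minimal map-building sketch (the random embeddings are placeholders for your own vectors, and `detect_duplicate` is the flag named above; check your client version for the exact signature):

``` py
from nomic import atlas
import numpy as np

# Placeholder embeddings; substitute your own matrix of shape (n_points, dim)
embeddings = np.random.rand(10_000, 256)

project = atlas.map_embeddings(
    embeddings=embeddings,
    detect_duplicate=True,  # enables duplicate clustering, per this guide
)
```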
``` py
from nomic import AtlasProject

map = AtlasProject(name='My Project').maps[0]
map.duplicates
```
AtlasMapDuplicates
Atlas Duplicate Clusters State

Atlas automatically groups embeddings that are sufficiently close into semantic clusters. You can use these clusters for semantic duplicate detection, allowing you to quickly deduplicate your data.
Source code in nomic/data_operations.py
```` py
class AtlasMapDuplicates:
    """
    Atlas Duplicate Clusters State

    Atlas automatically groups embeddings that are sufficiently close into semantic clusters.
    You can use these clusters for semantic duplicate detection, allowing you to quickly
    deduplicate your data.

    === "Accessing Duplicates Example"
        ``` py
        from nomic import AtlasProject

        project = AtlasProject(name='My Project')
        map = project.maps[0]
        print(map.duplicates)
        ```
    === "Output"
        ```
        460 deletion candidates in 9540 clusters
              id_      duplicate_class  cluster_id
        0      0A            singleton        5178
        1      0g  retention candidate         271
        2      0Q            singleton        6672
        3      0w            singleton        7529
        4      1A            singleton        1587
        ...   ...                  ...         ...
        9999  JZU            singleton        6346
        ```
    """

    def __init__(self, projection: "AtlasProjection"):
        self.projection = projection
        self.id_field = self.projection.project.id_field
        try:
            # Load only the id and duplicate-related columns from the projection's tiles.
            self._tb: pa.Table = projection._fetch_tiles().select(
                [self.id_field, '_duplicate_class', '_cluster_id']
            )
        except pa.lib.ArrowInvalid as e:
            raise ValueError("Duplicate detection has not yet been run on this map.") from e
        self._tb = self._tb.rename_columns([self.id_field, 'duplicate_class', 'cluster_id'])

    @property
    def df(self) -> pd.DataFrame:
        """
        Pandas dataframe mapping each data point to its cluster of semantically similar points.

        === "Accessing Duplicates Example"
            ``` py
            from nomic import AtlasProject

            project = AtlasProject(name='My Project')
            map = project.maps[0]
            print(map.duplicates.df)
            ```
        === "Output"
            ```
                  id_      duplicate_class  cluster_id
            0      0A            singleton        5178
            1      0g  retention candidate         271
            2      0Q            singleton        6672
            3      0w            singleton        7529
            4      1A            singleton        1587
            ...   ...                  ...         ...
            9999  JZU            singleton        6346
            ```
        """
        return self.tb.to_pandas()

    @property
    def tb(self) -> pa.Table:
        """
        Pyarrow table with information about duplicate clusters and candidates.
        This table is memmapped from the underlying files and is the most efficient way to
        access duplicate information.
        """
        return self._tb

    def deletion_candidates(self) -> List[str]:
        """
        Returns:
            The ids for all data points which are semantic duplicates and are candidates for
            being deleted from the dataset. If you remove these data points from your dataset,
            your dataset will be semantically deduplicated.
        """
        dupes = self.tb[self.id_field].filter(pc.equal(self.tb['duplicate_class'], 'deletion candidate'))
        return dupes.to_pylist()

    def __repr__(self) -> str:
        repr = f"===Atlas Duplicates for ({self.projection})===\n"
        duplicate_count = len(
            self.tb[self.id_field].filter(pc.equal(self.tb['duplicate_class'], 'deletion candidate'))
        )
        cluster_count = len(self.tb['cluster_id'].value_counts())
        repr += f"{duplicate_count} deletion candidates in {cluster_count} clusters\n"
        return repr + self.df.__repr__()
````
df: pd.DataFrame (property)
Pandas dataframe mapping each data point to its cluster of semantically similar points.
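Since this is a plain pandas dataframe, standard operations apply; for instance (a small sketch using the columns shown above):

``` py
df = map.duplicates.df

# Size of each duplicate cluster, largest first
print(df.groupby('cluster_id').size().sort_values(ascending=False).head())

# Rows flagged for deletion
print(df[df['duplicate_class'] == 'deletion candidate'])
```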
tb: pa.Table (property)
Pyarrow table with information about duplicate clusters and candidates. This table is memmapped from the underlying files and is the most efficient way to access duplicate information.
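Because the table is Arrow-backed, you can filter it with `pyarrow.compute` kernels without materializing a dataframe (a sketch):

``` py
import pyarrow.compute as pc

tb = map.duplicates.tb

# Row counts per duplicate class, computed directly on the Arrow table
print(tb['duplicate_class'].value_counts())

# Retention candidates only
print(tb.filter(pc.equal(tb['duplicate_class'], 'retention candidate')))
```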
deletion_candidates() -> List[str]
Returns: the ids for all data points which are semantic duplicates and are candidates for being deleted from the dataset. If you remove these data points from your dataset, your dataset will be semantically deduplicated.
Source code in nomic/data_operations.py
``` py
def deletion_candidates(self) -> List[str]:
    """
    Returns:
        The ids for all data points which are semantic duplicates and are candidates for
        being deleted from the dataset. If you remove these data points from your dataset,
        your dataset will be semantically deduplicated.
    """
    dupes = self.tb[self.id_field].filter(pc.equal(self.tb['duplicate_class'], 'deletion candidate'))
    return dupes.to_pylist()
```
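Putting it together, a typical deduplication pass might look like the sketch below; `delete_data` is an assumed project method here, so confirm it exists in your client version before relying on it.

``` py
from nomic import AtlasProject

project = AtlasProject(name='My Project')
map = project.maps[0]

# Ids of semantic duplicates; removing them leaves one representative per cluster
ids_to_remove = map.duplicates.deletion_candidates()

# Assumed API: delete the flagged points from the project
project.delete_data(ids=ids_to_remove)
```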