Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions diskann-benchmark/src/inputs/async_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,8 @@ impl IndexLoad {

let index_config = IndexConfiguration::new(
self.distance.into(),
metadata.ndims,
metadata.npoints,
metadata.ndims(),
metadata.npoints(),
num_frozen_pts,
1,
config,
Expand Down
9 changes: 4 additions & 5 deletions diskann-benchmark/src/utils/datafiles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@ pub(crate) fn load_dataset<T>(path: BinFile<'_>) -> anyhow::Result<Matrix<T>>
where
T: Copy + bytemuck::Pod,
{
let (data, num_data, data_dim) = diskann_providers::utils::file_util::load_bin::<T, _>(
&diskann_providers::storage::FileStorageProvider,
&path.0.to_string_lossy(),
0,
let data = diskann_utils::io::read_bin::<T>(
&mut diskann_providers::storage::FileStorageProvider
.open_reader(&path.0.to_string_lossy())?,
)?;
Ok(Matrix::try_from(data.into(), num_data, data_dim).map_err(|err| err.as_static())?)
Ok(data)
}

/// Helper trait to load a `Matrix<Self>` from source files that potentially have a different
Expand Down
21 changes: 10 additions & 11 deletions diskann-disk/src/build/builder/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ use diskann_providers::{
},
storage::{AsyncIndexMetadata, DiskGraphOnly, PQStorage},
utils::{
create_thread_pool, find_medoid_with_sampling, load_bin, save_bin_u32, RayonThreadPool,
VectorDataIterator, MAX_MEDOID_SAMPLE_SIZE,
create_thread_pool, find_medoid_with_sampling, RayonThreadPool, VectorDataIterator,
MAX_MEDOID_SAMPLE_SIZE,
},
};
use diskann_utils::io::{read_bin, write_bin};
use diskann_utils::views::MatrixView;
use tokio::task::JoinSet;
use tracing::{debug, info};

Expand Down Expand Up @@ -880,9 +882,9 @@ impl StartPoint {
path
)));
}
let (data, _, _) = load_bin::<u32, _>(&mut reader.open_reader(path)?, 0)?;
let data = read_bin::<u32>(&mut reader.open_reader(path)?)?;

let start_point_id = data.first().ok_or_else(|| {
let start_point_id = data.try_get(0, 0).ok_or_else(|| {
ANNError::log_invalid_file_format(format!("Start point ID file {} is empty", path))
})?;

Expand All @@ -894,12 +896,9 @@ impl StartPoint {
where
StorageWriter: StorageWriteProvider,
{
save_bin_u32(
write_bin(
MatrixView::row_vector(std::slice::from_ref(&self.0)),
&mut storage_provider.create_for_write(path)?,
std::slice::from_ref(&self.0),
1,
1,
0,
)?;
debug!("Saved start point ID {} to {}", self.0, path);
Ok(())
Expand All @@ -915,7 +914,7 @@ mod start_point_tests {
use std::io::Write;

use diskann_providers::storage::VirtualStorageProvider;
use diskann_providers::utils::write_metadata;
use diskann_utils::io::Metadata;

use super::*;

Expand Down Expand Up @@ -976,7 +975,7 @@ mod start_point_tests {
let mut file = storage_provider.create_for_write(file_path).unwrap();
let npts = 0;
let dim = 1;
write_metadata(&mut file, npts, dim).unwrap();
Metadata::new(npts, dim).unwrap().write(&mut file).unwrap();
}

let result = StartPoint::load(file_path, &storage_provider);
Expand Down
44 changes: 19 additions & 25 deletions diskann-disk/src/build/builder/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ use diskann_providers::{
},
storage::PQStorage,
utils::{
load_bin, load_metadata_from_file, RayonThreadPool, SampleVectorReader, SamplingDensity,
load_metadata_from_file, RayonThreadPool, SampleVectorReader, SamplingDensity,
READ_WRITE_BLOCK_SIZE,
},
};
use diskann_utils::io::read_bin;
use rand::{seq::SliceRandom, Rng};
use tracing::info;

Expand Down Expand Up @@ -129,7 +130,7 @@ where
let metadata = load_metadata_from_file(storage_provider, shard_base_file)?;

let mut index_config = base_config.clone();
index_config.max_points = metadata.npoints;
index_config.max_points = metadata.npoints();
index_config.config = low_degree_params;

Ok(index_config)
Expand All @@ -145,13 +146,11 @@ where
T: Default + bytemuck::Pod,
{
let storage_provider = self.storage_provider;
let (shard_ids, shard_size, _) = load_bin::<u32, StorageProvider::Reader>(
&mut storage_provider.open_reader(shard_ids_file)?,
0,
)?;
let shard_ids = read_bin::<u32>(&mut storage_provider.open_reader(shard_ids_file)?)?;
let shard_size = shard_ids.nrows();
info!("Loaded {} shard ids from {}", shard_size, shard_ids_file);
let max_id = shard_ids.iter().max().copied().unwrap_or(0);
let sampling_rate = shard_ids.len() as f64 / (max_id + 1) as f64;
let max_id = shard_ids.as_slice().iter().max().copied().unwrap_or(0);
let sampling_rate = shard_ids.as_slice().len() as f64 / (max_id + 1) as f64;

let mut dataset_reader: SampleVectorReader<T, _> = SampleVectorReader::new(
dataset_file,
Expand All @@ -172,7 +171,7 @@ where
shard_base_cached_writer.write(&dim.to_le_bytes())?;

let mut num_written: u32 = 0;
dataset_reader.read_vectors(shard_ids.iter().copied(), |vector_t| {
dataset_reader.read_vectors(shard_ids.as_slice().iter().copied(), |vector_t| {
// Casting Pod type to bytes always succeeds (u8 has alignment of 1)
let vector_bytes: &[u8] = bytemuck::must_cast_slice(vector_t);
shard_base_cached_writer.write(vector_bytes)?;
Expand Down Expand Up @@ -384,12 +383,9 @@ where
Ok(())
}

fn read_idmap(&self, idmaps_path: String) -> std::io::Result<Vec<u32>> {
let (data, _npts, _dim) = load_bin::<u32, StorageProvider::Reader>(
&mut self.storage_provider.open_reader(&idmaps_path)?,
0,
)?;
Ok(data)
fn read_idmap(&self, idmaps_path: String) -> Result<Vec<u32>, diskann_utils::io::ReadBinError> {
let data = read_bin::<u32>(&mut self.storage_provider.open_reader(&idmaps_path)?)?;
Ok(data.into_inner().into_vec())
}

fn merge_shards_and_cleanup(
Expand Down Expand Up @@ -644,7 +640,7 @@ pub(crate) mod disk_index_builder_tests {
test_utils::graph_data_type_utils::{
GraphDataF32VectorU32Data, GraphDataF32VectorUnitData,
},
utils::{file_util, BridgeErr, Timer},
utils::Timer,
};
use diskann_utils::test_data_root;
use diskann_vector::{
Expand Down Expand Up @@ -787,15 +783,17 @@ pub(crate) mod disk_index_builder_tests {
.unwrap();

assert_eq!(
self.params.dim, metadata.ndims,
self.params.dim,
metadata.ndims(),
"Parameters dimension {} and data dimension {} are not equal",
self.params.dim, metadata.ndims
self.params.dim,
metadata.ndims(),
);

let config = IndexConfiguration::new(
self.params.metric,
self.params.dim,
metadata.npoints,
metadata.npoints(),
ONE,
self.params.num_threads,
config,
Expand Down Expand Up @@ -1067,13 +1065,9 @@ pub(crate) mod disk_index_builder_tests {
None,
)?;

let (data, npoints, dim) = file_util::load_bin::<G::VectorDataType, _>(
storage_provider.as_ref(),
&params.data_path,
0,
)?;
let data =
diskann_utils::views::Matrix::try_from(data.into(), npoints, dim).bridge_err()?;
read_bin::<G::VectorDataType>(&mut storage_provider.open_reader(&params.data_path)?)?;
let dim = data.ncols();
let distance = <G::VectorDataType>::distance(params.metric, Some(dim));

// Here, we use elements of the dataset to search the dataset itself.
Expand Down
19 changes: 8 additions & 11 deletions diskann-disk/src/search/provider/disk_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1059,11 +1059,9 @@ mod disk_provider_tests {
test_utils::graph_data_type_utils::{
GraphDataF32VectorU32Data, GraphDataF32VectorUnitData,
},
utils::{
create_thread_pool, file_util, load_aligned_bin, PQPathNames, ParallelIteratorInPool,
},
utils::{create_thread_pool, load_aligned_bin, PQPathNames, ParallelIteratorInPool},
};
use diskann_utils::test_data_root;
use diskann_utils::{io::read_bin, test_data_root};
use diskann_vector::distance::Metric;
use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator};
use rstest::rstest;
Expand Down Expand Up @@ -1189,10 +1187,10 @@ mod disk_provider_tests {
) -> Vec<u32> {
const ASSOCIATED_DATA_FILE: &str = "/sift/siftsmall_learn_256pts_u32_associated_data.fbin";

let (data, _npts, _dim) =
file_util::load_bin::<u32, StorageReader>(storage_provider, ASSOCIATED_DATA_FILE, 0)
let data =
read_bin::<u32>(&mut storage_provider.open_reader(ASSOCIATED_DATA_FILE).unwrap())
.unwrap();
data
data.into_inner().into_vec()
}

#[test]
Expand Down Expand Up @@ -1339,10 +1337,9 @@ mod disk_provider_tests {
storage_provider: &StorageReader,
query_result_path: &str,
) -> Vec<u32> {
let (result, _, _) =
file_util::load_bin::<u32, StorageReader>(storage_provider, query_result_path, 0)
.unwrap();
result
let result =
read_bin::<u32>(&mut storage_provider.open_reader(query_result_path).unwrap()).unwrap();
result.into_inner().into_vec()
}

struct TestDiskSearchParams<'a, StorageType> {
Expand Down
6 changes: 3 additions & 3 deletions diskann-disk/src/search/provider/disk_vertex_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ mod disk_vertex_provider_tests {
let metadata = load_metadata_from_file(storage_provider, data_path).unwrap();

let memory_budget = MemoryBudget::try_from_gb(1.0).unwrap();
let num_pq_chunks = NumPQChunks::new_with(128, metadata.ndims).unwrap();
let num_pq_chunks = NumPQChunks::new_with(128, metadata.ndims()).unwrap();

let disk_index_build_parameters =
DiskIndexBuildParameters::new(memory_budget, QuantizationType::FP, num_pq_chunks);
Expand All @@ -326,8 +326,8 @@ mod disk_vertex_provider_tests {

let config = IndexConfiguration::new(
diskann_vector::distance::Metric::L2,
metadata.ndims,
metadata.npoints,
metadata.ndims(),
metadata.npoints(),
ONE,
1,
config,
Expand Down
6 changes: 3 additions & 3 deletions diskann-disk/src/storage/disk_index_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,20 @@ impl<VectorType> DiskIndexReader<VectorType> {

let pq_compressed_data = PQStorage::load_pq_compressed_vectors_bin::<Storage>(
&pq_compressed_data_path,
metadata.npoints,
metadata.npoints(),
pq_pivot_table.get_num_chunks(),
storage_provider,
)?;
info!(
"Loaded PQ centroids and in-memory compressed vectors. #points:{} #pq_chunks: {}",
metadata.npoints,
metadata.npoints(),
pq_pivot_table.get_num_chunks()
);

Ok(DiskIndexReader {
phantom: PhantomData,
pq_data: Arc::<PQData>::new(PQData::new(pq_pivot_table, pq_compressed_data)?),
num_points: metadata.npoints,
num_points: metadata.npoints(),
})
}

Expand Down
40 changes: 19 additions & 21 deletions diskann-disk/src/storage/quant/generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,9 @@ use diskann::{error::IntoANNResult, utils::VectorRepr, ANNError, ANNResult};
use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider};
use diskann_providers::{
forward_threadpool,
utils::{
load_metadata_from_file, write_metadata, AsThreadPool, BridgeErr, ParallelIteratorInPool,
Timer,
},
utils::{load_metadata_from_file, AsThreadPool, BridgeErr, ParallelIteratorInPool, Timer},
};
use diskann_utils::views::{self};
use diskann_utils::{io::Metadata, views};
use rayon::iter::IndexedParallelIterator;
use tracing::info;

Expand Down Expand Up @@ -115,7 +112,7 @@ where
let timer = Timer::new();

let metadata = load_metadata_from_file(storage_provider, &self.data_path)?;
let (num_points, dim) = (metadata.npoints, metadata.ndims);
let (num_points, dim) = metadata.into_dims();

self.validate_params(num_points, storage_provider)?;

Expand All @@ -135,8 +132,8 @@ where
storage_provider.open_writer(compressed_path)?
} else {
let mut sp = storage_provider.create_for_write(compressed_path)?;
// write meatadata to header
write_metadata(&mut sp, num_points, self.quantizer.compressed_bytes())?;
// write metadata to header
Metadata::new(num_points, self.quantizer.compressed_bytes())?.write(&mut sp)?;
sp
};

Expand Down Expand Up @@ -279,8 +276,10 @@ mod generator_tests {

use diskann::utils::read_exact_into;
use diskann_providers::storage::VirtualStorageProvider;
use diskann_providers::utils::{
create_thread_pool_for_test, read_metadata, save_bin_f32, save_bytes,
use diskann_providers::utils::{create_thread_pool_for_test, save_bytes};
use diskann_utils::{
io::{write_bin, Metadata},
views::MatrixView,
};
use rstest::rstest;
use vfs::{FileSystem, MemoryFS};
Expand Down Expand Up @@ -384,12 +383,11 @@ mod generator_tests {
let compressed_path = "/test_data/test_compressed.bin".to_string();

// Setup test data
let _ = save_bin_f32(
let data = create_test_data(num_points, dim);
let view = MatrixView::try_from(data.as_slice(), num_points, dim).unwrap();
write_bin(
view,
&mut storage_provider.create_for_write(data_path.as_str())?,
&create_test_data(num_points, dim),
num_points,
dim,
0,
)?;

if offset > 0 {
Expand Down Expand Up @@ -483,13 +481,13 @@ mod generator_tests {

let mut r = storage_provider.open_reader(compressed_path.as_str())?;
let mut reader = BufReader::new(&mut r);
let metadata = read_metadata(&mut reader)?;
let metadata = Metadata::read(&mut reader)?;

let data: Vec<u8> = read_exact_into(&mut reader, expected_size)?;

// Check header
assert_eq!(metadata.ndims as u32, output_dim);
assert_eq!(metadata.npoints, num_points);
assert_eq!(metadata.ndims_u32(), output_dim);
assert_eq!(metadata.npoints(), num_points);

// Check compressed data content
data.chunks_exact(output_dim as usize)
Expand Down Expand Up @@ -542,14 +540,14 @@ mod generator_tests {

let mut r = storage_provider.open_reader(compressed_path.as_str())?;
let mut reader = BufReader::new(&mut r);
let metadata = read_metadata(&mut reader)?;
let metadata = Metadata::read(&mut reader)?;

let data: Vec<u8> =
read_exact_into(&mut reader, expected_size - 2 * std::mem::size_of::<i32>())?;

// Check header
assert_eq!(metadata.ndims as u32, output_dim);
assert_eq!(metadata.npoints, num_points);
assert_eq!(metadata.ndims_u32(), output_dim);
assert_eq!(metadata.npoints(), num_points);

// Check compressed data content
data.chunks_exact(output_dim as usize)
Expand Down
Loading
Loading