diff --git a/Cargo.toml b/Cargo.toml
index 6a18dd6d..0dbfc56f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,6 @@
 enum_primitive = "0.1.0"
 byteorder = "0.4"
 num = "0.1"
 lazy_static = "0.1.15"
-linear-map = "0.0.4"
 clippy = { version = "0.0.27", optional = true }
 
diff --git a/src/tensor.rs b/src/tensor.rs
index 37fedd8a..7b86b6d8 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -47,23 +47,43 @@
 //! # }
 //! ```
 
-use linear_map::LinearMap;
 use device::{IDevice, DeviceType};
 use memory::MemoryType;
+use std::cell::{Cell, RefCell};
 use std::marker::PhantomData;
 use std::{fmt, mem, error};
+use std::error::Error as StdError;
 
 /// Describes the Descriptor of a SharedTensor.
 pub type TensorDesc = Vec<usize>;
 
-#[derive(Debug)]
+/// BitMap type for keeping track of up-to-date locations. If the number of
+/// locations provided by the integer isn't enough, this type can be easily
+/// replaced with BitSet at the cost of a heap allocation and an extra
+/// indirection on access.
+type BitMap = u64;
+
+/// Number of bits in `BitMap`. It's currently not possible to get this
+/// information from `BitMap` cleanly, though there are plans to add a
+/// static method or an associated constant.
+const BIT_MAP_SIZE: usize = 64;
+
+struct TensorLocation {
+    device: DeviceType,
+
+    // Box is required to keep references to MemoryType alive if
+    // SharedTensor::locations vec reallocates storage and moves elements.
+    // See also the comment on `unsafe` near the `SharedTensor::read()` impl.
+    mem: Box<MemoryType>,
+}
+
 /// Container that handles synchronization of [Memory][1] of type `T`.
 /// [1]: ../memory/index.html
 pub struct SharedTensor<T> {
     desc: TensorDesc,
-    latest_location: DeviceType,
-    latest_copy: MemoryType,
-    copies: LinearMap<DeviceType, MemoryType>,
+    locations: RefCell<Vec<TensorLocation>>,
+    up_to_date: Cell<BitMap>,
+
     phantom: PhantomData<T>,
 }
@@ -236,18 +256,21 @@ impl ITensorDesc for TensorDesc {
     }
 }
 
+
+impl<T> fmt::Debug for SharedTensor<T> {
+    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
+        write!(f, "SharedTensor desc={:?}", self.desc)
+    }
+}
+
 impl<T> SharedTensor<T> {
     /// Create new Tensor by allocating [Memory][1] on a Device.
     /// [1]: ../memory/index.html
-    pub fn new<D: IntoTensorDesc>(dev: &DeviceType, desc: &D) -> Result<SharedTensor<T>, Error> {
-        let copies = LinearMap::<DeviceType, MemoryType>::new();
-        let copy = try!(Self::alloc_on_device(dev, desc));
-        let tensor_desc: TensorDesc = desc.into();
+    pub fn new<D: IntoTensorDesc>(desc: &D) -> Result<SharedTensor<T>, Error> {
         Ok(SharedTensor {
-            desc: tensor_desc,
-            latest_location: dev.clone(),
-            latest_copy: copy,
-            copies: copies,
+            desc: desc.into(),
+            locations: RefCell::new(Vec::new()),
+            up_to_date: Cell::new(0),
             phantom: PhantomData,
         })
     }
@@ -273,149 +296,183 @@ impl<T> SharedTensor<T> {
     /// 'reshape' is preffered over this method if the size of the old and new shape
     /// are identical because it will not reallocate memory.
     pub fn resize<D: IntoTensorDesc>(&mut self, desc: &D) -> Result<(), Error> {
-        self.copies.clear();
-        self.latest_copy = try!(Self::alloc_on_device(self.latest_device(), desc));
-        let new_desc: TensorDesc = desc.into();
-        self.desc = new_desc;
+        self.locations.borrow_mut().clear();
+        self.up_to_date.set(0);
+        self.desc = desc.into();
         Ok(())
     }
 
-    /// Allocate memory on the provided DeviceType.
-    fn alloc_on_device<D: IntoTensorDesc>(dev: &DeviceType, desc: &D) -> Result<MemoryType, Error> {
-        let tensor_desc: TensorDesc = desc.into();
-        let alloc_size = Self::mem_size(tensor_desc.size());
-        let copy = match *dev {
-            #[cfg(feature = "native")]
-            DeviceType::Native(ref cpu) => MemoryType::Native(try!(cpu.alloc_memory(alloc_size))),
-            #[cfg(feature = "opencl")]
-            DeviceType::OpenCL(ref context) => MemoryType::OpenCL(try!(context.alloc_memory(alloc_size))),
-            #[cfg(feature = "cuda")]
-            DeviceType::Cuda(ref context) => MemoryType::Cuda(try!(context.alloc_memory(alloc_size))),
-        };
-        Ok(copy)
+    fn get_location_index(&self, device: &DeviceType) -> Option<usize> {
+        for (i, loc) in self.locations.borrow().iter().enumerate() {
+            if loc.device == *device {
+                return Some(i);
+            }
+        }
+        None
     }
 
-    /// Synchronize memory from latest location to `destination`.
-    pub fn sync(&mut self, destination: &DeviceType) -> Result<(), Error> {
-        if &self.latest_location != destination {
-            let latest = self.latest_location.clone();
-            try!(self.sync_from_to(&latest, &destination));
-
-            let mut swap_location = destination.clone();
-            let mut swap_copy = try!(self.copies.remove(destination).ok_or(Error::MissingDestination("Tensor does not hold a copy on destination device.")));
-            mem::swap(&mut self.latest_location, &mut swap_location);
-            mem::swap(&mut self.latest_copy, &mut swap_copy);
-            self.copies.insert(swap_location, swap_copy);
+    /// Looks up `device` in `self.locations` and returns its index. If the
+    /// lookup fails, a new location is created and its index is returned.
+    fn get_or_create_location_index(&self, device: &DeviceType)
+                                    -> Result<usize, Error> {
+        if let Some(i) = self.get_location_index(device) {
+            return Ok(i);
         }
-        Ok(())
-    }
 
-    /// Get a reference to the memory copy on the provided `device`.
-    ///
-    /// Returns `None` if there is no memory copy on the device.
-    pub fn get(&self, device: &DeviceType) -> Option<&MemoryType> {
-        // first check if device is not current location. This is cheaper than a lookup in `copies`.
-        if &self.latest_location == device {
-            return Some(&self.latest_copy)
+        if self.locations.borrow().len() == BIT_MAP_SIZE {
+            return Err(Error::CapacityExceeded);
         }
-        self.copies.get(device)
-    }
 
-    /// Get a mutable reference to the memory copy on the provided `device`.
-    ///
-    /// Returns `None` if there is no memory copy on the device.
-    pub fn get_mut(&mut self, device: &DeviceType) -> Option<&mut MemoryType> {
-        // first check if device is not current location. This is cheaper than a lookup in `copies`.
-        if &self.latest_location == device {
-            return Some(&mut self.latest_copy)
+        let mem = try!(Self::alloc_on_device(device, self.desc().size()));
+        self.locations.borrow_mut().push(TensorLocation {
+            device: device.clone(),
+            mem: Box::new(mem),
+        });
+        Ok(self.locations.borrow().len() - 1)
+    }
+
+    // TODO: choose the best source to copy data from.
+    // That would require some additional traits that return costs for
+    // transferring data between different backends.
+    // Actually I think that there will only be transfers between
+    // `Native` <-> `Cuda` and `Native` <-> `OpenCL` in the foreseeable
+    // future, so it's best not to overengineer here.
+    fn sync_if_needed(&self, dst_i: usize) -> Result<(), Error> {
+        if self.up_to_date.get() & (1 << dst_i) != 0 {
+            return Ok(());
         }
-        self.copies.get_mut(device)
-    }
-
-    /// Synchronize memory from `source` device to `destination` device.
-    fn sync_from_to(&mut self, source: &DeviceType, destination: &DeviceType) -> Result<(), Error> {
-        if source != destination {
-            match self.copies.get_mut(destination) {
-                Some(mut destination_copy) => {
-                    match destination {
-                        #[cfg(feature = "native")]
-                        &DeviceType::Native(ref cpu) => {
-                            match destination_copy.as_mut_native() {
-                                Some(ref mut mem) => try!(cpu.sync_in(&self.latest_location, &self.latest_copy, mem)),
-                                None => return Err(Error::InvalidMemory("Expected Native Memory (FlatBox)"))
-                            }
-                        },
-                        #[cfg(feature = "cuda")]
-                        &DeviceType::Cuda(ref context) => {
-                            match destination_copy.as_mut_cuda() {
-                                Some(ref mut mem) => try!(context.sync_in(&self.latest_location, &self.latest_copy, mem)),
-                                None => return Err(Error::InvalidMemory("Expected CUDA Memory."))
-                            }
-                        },
-                        #[cfg(feature = "opencl")]
-                        &DeviceType::OpenCL(ref context) => {
-                            match destination_copy.as_mut_opencl() {
-                                Some(ref mut mem) => try!(context.sync_in(&self.latest_location, &self.latest_copy, mem)),
-                                None => return Err(Error::InvalidMemory("Expected OpenCL Memory."))
-                            }
-                        }
-                    }
-                    Ok(())
-                },
-                None => Err(Error::MissingDestination("Tensor does not hold a copy on destination device."))
-            }
+
+        let src_i = self.up_to_date.get().trailing_zeros() as usize;
+        assert!(src_i != BIT_MAP_SIZE);
+
+        // We need to borrow two different Vec elements: src and mut dst.
+        // Borrowck doesn't allow us to do that in a straightforward way, so
+        // here is a workaround.
+        assert!(src_i != dst_i);
+        let mut locs = self.locations.borrow_mut();
+        let (src_loc, mut dst_loc) = if src_i < dst_i {
+            let (left, right) = locs.split_at_mut(dst_i);
+            (&left[src_i], &mut right[0])
         } else {
-            Ok(())
-        }
-    }
+            let (left, right) = locs.split_at_mut(src_i);
+            (&right[0], &mut left[dst_i])
+        };
 
-    /// Removes Copy from SharedTensor and therefore aquires ownership over the removed memory copy for synchronizing.
-    pub fn remove_copy(&mut self, destination: &DeviceType) -> Result<(MemoryType), Error> {
-        // If `destination` holds the latest data, sync to another memory first, before removing it.
-        if &self.latest_location == destination {
-            let first = self.copies.keys().nth(0).unwrap().clone();
-            try!(self.sync(&first));
-        }
-        match self.copies.remove(destination) {
-            Some(destination_cpy) => Ok(destination_cpy),
-            None => Err(Error::MissingDestination("Tensor does not hold a copy on destination device."))
+        match &dst_loc.device {
+            #[cfg(feature = "native")]
+            &DeviceType::Native(ref cpu) => {
+                let mem = dst_loc.mem.as_mut_native()
+                    .expect("Broken invariant: expected Native Memory");
+                try!(cpu.sync_in(&src_loc.device, &src_loc.mem, mem));
+            },
+            #[cfg(feature = "cuda")]
+            &DeviceType::Cuda(ref context) => {
+                let mem = dst_loc.mem.as_mut_cuda()
+                    .expect("Broken invariant: expected Cuda Memory");
+                try!(context.sync_in(&src_loc.device, &src_loc.mem, mem));
+            },
+            #[cfg(feature = "opencl")]
+            &DeviceType::OpenCL(ref context) => {
+                let mem = dst_loc.mem.as_mut_opencl()
+                    .expect("Broken invariant: expected OpenCL Memory");
+                try!(context.sync_in(&src_loc.device, &src_loc.mem, mem));
+            }
         }
+
+        Ok(())
     }
 
-    /// Return ownership over a memory copy after synchronizing.
-    fn return_copy(&mut self, dest: &DeviceType, dest_mem: MemoryType) {
-        self.copies.insert(dest.clone(), dest_mem);
+    /// Allocate memory on the provided DeviceType for `n` elements.
+    fn alloc_on_device(dev: &DeviceType, n: usize) -> Result<MemoryType, Error> {
+        let alloc_size = Self::mem_size(n);
+        let copy = match *dev {
+            #[cfg(feature = "native")]
+            DeviceType::Native(ref cpu) => MemoryType::Native(try!(cpu.alloc_memory(alloc_size))),
+            #[cfg(feature = "opencl")]
+            DeviceType::OpenCL(ref context) => MemoryType::OpenCL(try!(context.alloc_memory(alloc_size))),
+            #[cfg(feature = "cuda")]
+            DeviceType::Cuda(ref context) => MemoryType::Cuda(try!(context.alloc_memory(alloc_size))),
+        };
+        Ok(copy)
     }
 
-    /// Track a new `device` and allocate memory on it.
-    ///
-    /// Returns an error if the Tensor is already tracking the `device`.
-    pub fn add_device(&mut self, device: &DeviceType) -> Result<&mut Self, Error> {
-        // first check if device is not current location. This is cheaper than a lookup in `copies`.
-        if &self.latest_location == device {
-            return Err(Error::InvalidMemoryAllocation("Tensor already tracks memory for this device. No memory allocation."))
+    // Functions `read()`, `read_write()`, `write_only()` use `unsafe` to
+    // extend the lifetime of the returned reference to an internally owned memory chunk.
+    // Borrowck guarantees that SharedTensor outlives all of its Tensors, and
+    // that there is only one mutable borrow. So we only need to make sure that
+    // memory locations won't be dropped or moved while there are live Tensors.
+    // It's quite easy to do: by convention we only allow removal of elements from
+    // `self.locations` in methods that take `&mut self`. Since we store `MemoryType`
+    // inside the `Vec` in a `Box`, a reference to it won't change during `Vec`
+    // reallocations.
+
+    /// Get memory for reading on the specified `device`.
+    /// Can fail if memory allocation fails, or if the tensor wasn't initialized yet.
+    pub fn read<'a>(&'a self, device: &DeviceType) -> Result<&'a MemoryType, Error> {
+        if self.up_to_date.get() == 0 {
+            return Err(Error::UninitializedMemory);
         }
-        match self.copies.get(device) {
-            Some(_) => Err(Error::InvalidMemoryAllocation("Tensor already tracks memory for this device. No memory allocation.")),
-            None => {
-                let copy: MemoryType;
-                match *device {
-                    #[cfg(feature = "native")]
-                    DeviceType::Native(ref cpu) => copy = MemoryType::Native(try!(cpu.alloc_memory(Self::mem_size(self.capacity())))),
-                    #[cfg(feature = "opencl")]
-                    DeviceType::OpenCL(ref context) => copy = MemoryType::OpenCL(try!(context.alloc_memory(Self::mem_size(self.capacity())))),
-                    #[cfg(feature = "cuda")]
-                    DeviceType::Cuda(ref context) => copy = MemoryType::Cuda(try!(context.alloc_memory(Self::mem_size(self.capacity())))),
-                };
-                self.copies.insert(device.clone(), copy);
-                Ok(self)
-            }
+        let i = try!(self.get_or_create_location_index(device));
+        try!(self.sync_if_needed(i));
+        self.up_to_date.set(self.up_to_date.get() | (1 << i));
+
+        let locs = self.locations.borrow();
+        let mem: &MemoryType = &locs[i].mem;
+        let mem_a: &'a MemoryType = unsafe { ::std::mem::transmute(mem) };
+        Ok(mem_a)
+    }
+
+    /// Get memory for reading and writing on the specified `device`.
+    /// Can fail if memory allocation fails, or if the tensor wasn't initialized yet.
+    pub fn read_write<'a>(&'a mut self, device: &DeviceType)
+                          -> Result<&'a mut MemoryType, Error> {
+        if self.up_to_date.get() == 0 {
+            return Err(Error::UninitializedMemory);
+        }
+        let i = try!(self.get_or_create_location_index(device));
+        try!(self.sync_if_needed(i));
+        self.up_to_date.set(1 << i);
+
+        let mut locs = self.locations.borrow_mut();
+        let mem: &mut MemoryType = &mut locs[i].mem;
+        let mem_a: &'a mut MemoryType = unsafe { ::std::mem::transmute(mem) };
+        Ok(mem_a)
+    }
+
+    /// Get memory for writing only.
+    /// This function skips synchronization and initialization checks, since
+    /// contents will be overwritten anyway. By convention the caller must fully
+    /// initialize the returned memory. Failure to do so may result in use of
+    /// uninitialized data later. If the caller fails to overwrite memory,
+    /// for some reason, it must call `invalidate()` to return the tensor to
+    /// an uninitialized state.
+    pub fn write_only<'a>(&'a mut self, device: &DeviceType)
+                          -> Result<&'a mut MemoryType, Error> {
+        let i = try!(self.get_or_create_location_index(device));
+        self.up_to_date.set(1 << i);
+
+        let mut locs = self.locations.borrow_mut();
+        let mem: &mut MemoryType = &mut locs[i].mem;
+        let mem_a: &'a mut MemoryType = unsafe { ::std::mem::transmute(mem) };
+        Ok(mem_a)
+    }
+
+    /// Drops memory allocation on the specified device. Returns an error if no memory is allocated on it.
+    pub fn drop_at(&mut self, device: &DeviceType) -> Result<(), Error> {
+        match self.get_location_index(device) {
+            Some(i) => {
+                self.locations.borrow_mut().remove(i);
+
+                let up_to_date = self.up_to_date.get();
+                let mask = (1 << i) - 1;
+                let lower = up_to_date & mask;
+                let upper = (up_to_date >> 1) & (!mask);
+                self.up_to_date.set(lower | upper);
+                Ok(())
+            },
+            None =>
+                Err(Error::InvalidMemory("Memory isn't allocated on this device"))
         }
-    }
-
-    /// Returns the device that contains the up-to-date memory copy.
-    pub fn latest_device(&self) -> &DeviceType {
-        &self.latest_location
     }
 
     /// Returns the number of elements for which the Tensor has been allocated.
@@ -452,21 +509,16 @@ pub enum Error {
     /// Framework error at memory synchronization.
     MemorySynchronizationError(::device::Error),
     /// Shape provided for reshaping is not compatible with old shape.
-    InvalidShape(&'static str)
+    InvalidShape(&'static str),
+    /// Maximum number of backing memories has been reached.
+    CapacityExceeded,
+    /// Memory is requested for reading, but it hasn't been initialized.
+    UninitializedMemory,
 }
 
 impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match *self {
-            Error::MissingSource(ref err) => write!(f, "{:?}", err),
-            Error::MissingDestination(ref err) => write!(f, "{:?}", err),
-            Error::InvalidMemory(ref err) => write!(f, "{:?}", err),
-            Error::InvalidMemoryAllocation(ref err) => write!(f, "{:?}", err),
-            Error::InvalidRemove(ref err) => write!(f, "{:?}", err),
-            Error::MemoryAllocationError(ref err) => write!(f, "{}", err),
-            Error::MemorySynchronizationError(ref err) => write!(f, "{}", err),
-            Error::InvalidShape(ref err) => write!(f, "{}", err),
-        }
+        write!(f, "{}", self.description())
     }
 }
 
@@ -481,6 +533,10 @@ impl error::Error for Error {
             Error::MemoryAllocationError(ref err) => err.description(),
             Error::MemorySynchronizationError(ref err) => err.description(),
             Error::InvalidShape(ref err) => err,
+            Error::CapacityExceeded =>
+                "Max number of backing memories has been reached",
+            Error::UninitializedMemory =>
+                "Uninitialized memory is requested for reading",
         }
     }
 
@@ -494,6 +550,8 @@
             Error::MemoryAllocationError(ref err) => Some(err),
             Error::MemorySynchronizationError(ref err) => Some(err),
             Error::InvalidShape(_) => None,
+            Error::CapacityExceeded => None,
+            Error::UninitializedMemory => None,
        }
    }
}
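
For review context, a minimal usage sketch of the reworked API (not part of the patch). It assumes the crate's `DeviceType`, `SharedTensor` and `Error` are in scope, that `Vec<usize>` implements `IntoTensorDesc` so it can be used as the shape argument, and that native memory can be filled through `FlatBox::as_mut_slice::<f32>()` as in the crate's existing examples; the function name is illustrative only.

// Sketch only: exercises the new lazily-allocating accessors under the
// assumptions stated above.
fn fill_and_read(device: &DeviceType) -> Result<(), Error> {
    // Tensors no longer take a device at construction time; memory is
    // allocated lazily on first access.
    let mut x = try!(SharedTensor::<f32>::new(&vec![2usize, 3]));

    // Reading before the first write fails: no location is up to date yet.
    assert!(x.read(device).is_err());

    // `write_only` allocates on `device`, skips synchronization and marks
    // this location as the only up-to-date one; the caller must initialize
    // the returned memory completely.
    {
        let mem = try!(x.write_only(device));
        if let Some(flat) = mem.as_mut_native() {
            for v in flat.as_mut_slice::<f32>() {
                *v = 1.0;
            }
        }
    }

    // `read` now succeeds; requesting it on another device would allocate
    // there and copy from an up-to-date location first.
    try!(x.read(device));

    // Dropping the only allocation clears its `up_to_date` bit and returns
    // the tensor to the uninitialized state.
    try!(x.drop_at(device));
    assert!(x.read(device).is_err());
    Ok(())
}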