Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check filesystem for existing corpora on creation, import and deletion #313

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 36 additions & 26 deletions graphannis/src/annis/db/corpusstorage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -991,17 +991,23 @@ impl CorpusStorage {
check_cache_size_and_remove_with_cache(cache, &self.cache_strategy, vec![], false)?;

// remove any possible old corpus
if cache.contains_key(&corpus_name) {
if overwrite_existing {
let old_entry = cache.remove(&corpus_name);
if old_entry.is_some() {
if let Err(e) = std::fs::remove_dir_all(db_path.clone()) {
error!("Error when removing existing files {}", e);
}
if overwrite_existing {
let old_entry = cache.remove(&corpus_name);

// if there is a cache entry, acquire an exclusive lock for it because
// other queries or background writers might still have access to it and need to finish first
let _lock = old_entry
.as_ref()
.map(|db_entry| db_entry.write())
.transpose()?;

if db_path.is_dir() {
if let Err(e) = std::fs::remove_dir_all(&db_path) {
error!("Error when removing existing files {}", e);
}
} else {
return Err(GraphAnnisError::CorpusExists(corpus_name.to_string()));
}
} else if cache.contains_key(&corpus_name) || db_path.is_dir() {
return Err(GraphAnnisError::CorpusExists(corpus_name.to_string()));
}

if let Err(e) = std::fs::create_dir_all(&db_path) {
Expand Down Expand Up @@ -1355,24 +1361,27 @@ impl CorpusStorage {
let db_path = self.corpus_directory_on_disk(corpus_name);

let mut cache_lock = self.corpus_cache.write()?;

let cache = &mut *cache_lock;

// remove any possible old corpus
if let Some(db_entry) = cache.remove(corpus_name) {
// acquire an exclusive lock for this cache entry because
// other queries or background writers might still have access to it and need to finish first
let mut _lock = db_entry.write()?;

if db_path.is_dir() && db_path.exists() {
std::fs::remove_dir_all(db_path).map_err(|e| {
CorpusStorageError::RemoveFileForCorpus {
corpus: corpus_name.to_string(),
source: e,
}
})?
}
let db_entry = cache.remove(corpus_name);

// if there is a cache entry, acquire an exclusive lock for it because
// other queries or background writers might still have access to it and need to finish first
let _lock = db_entry
.as_ref()
.map(|db_entry| db_entry.write())
.transpose()?;

if db_path.is_dir() {
std::fs::remove_dir_all(db_path).map_err(|e| {
CorpusStorageError::RemoveFileForCorpus {
corpus: corpus_name.to_string(),
source: e,
}
})?;

Ok(true)
} else if db_entry.is_some() {
Ok(true)
} else {
Ok(false)
Expand All @@ -1384,11 +1393,12 @@ impl CorpusStorage {
/// Use [`apply_update`](CorpusStorage::apply_update) to add elements to the corpus. Returns whether a
/// new corpus was created.
pub fn create_empty_corpus(&self, corpus_name: &str, disk_based: bool) -> Result<bool> {
let mut cache_lock = self.corpus_cache.write()?;
let db_path = self.corpus_directory_on_disk(corpus_name);

let mut cache_lock = self.corpus_cache.write()?;
let cache = &mut *cache_lock;

if cache.contains_key(corpus_name) {
if cache.contains_key(corpus_name) || db_path.is_dir() {
Ok(false)
} else {
self.load_entry_with_lock(&mut cache_lock, corpus_name, true, disk_based)?;
Expand Down
207 changes: 205 additions & 2 deletions graphannis/src/annis/db/corpusstorage/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use pretty_assertions::assert_eq;
use super::SearchQuery;

#[test]
fn delete() {
fn delete_existing_cached_corpus() {
let tmp = tempfile::tempdir().unwrap();
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
// fully load a corpus
Expand All @@ -38,7 +38,81 @@ fn delete() {

cs.apply_update("testcorpus", &mut g).unwrap();
cs.preload("testcorpus").unwrap();
cs.delete("testcorpus").unwrap();

let deleted = cs.delete("testcorpus").unwrap();

assert_eq!(true, deleted);
}

/// `delete` must return `true` for a corpus that was written by an earlier
/// storage instance and is then deleted through a freshly opened instance on
/// the same directory.
#[test]
fn delete_existing_uncached_corpus() {
    let tmp = tempfile::tempdir().unwrap();
    {
        // First instance: populate `testcorpus` with a single node, then let
        // the instance go out of scope at the end of this block.
        let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
        let mut g = GraphUpdate::new();
        g.add_event(UpdateEvent::AddNode {
            node_name: "test".to_string(),
            node_type: "node".to_string(),
        })
        .unwrap();

        cs.apply_update("testcorpus", &mut g).unwrap();
    }

    {
        // Second instance on the same directory. It never touched the corpus
        // itself, so the deletion is expected to be driven by what is on disk.
        let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();

        let deleted = cs.delete("testcorpus").unwrap();

        assert!(
            deleted,
            "deleting a corpus written by a previous storage instance must return true"
        );
    }
}

/// `delete` must return `false` when the corpus name was never created in
/// this (empty, temporary) data directory.
#[test]
fn delete_nonexisting_corpus() {
    let tmp = tempfile::tempdir().unwrap();
    let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();

    let deleted = cs.delete("testcorpus").unwrap();

    assert!(
        !deleted,
        "deleting a corpus that does not exist must return false"
    );
}

/// Calling `create_empty_corpus` twice on the same storage instance must
/// return `false` the second time, because no new corpus is created.
#[test]
fn create_empty_corpus_existing_cached() {
    let tmp = tempfile::tempdir().unwrap();
    let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
    // The first call sets up the corpus; its return value is not under test here.
    cs.create_empty_corpus("testcorpus", false).unwrap();

    let created = cs.create_empty_corpus("testcorpus", false).unwrap();

    assert!(
        !created,
        "creating a corpus that already exists in the same instance must return false"
    );
}

/// `create_empty_corpus` must return `false` when the corpus was created by
/// an earlier storage instance working on the same directory.
#[test]
fn create_empty_corpus_existing_uncached() {
    let tmp = tempfile::tempdir().unwrap();
    {
        // First instance creates the corpus and goes out of scope afterwards.
        let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
        cs.create_empty_corpus("testcorpus", false).unwrap();
    }
    {
        // Fresh instance on the same directory: the corpus was not created
        // through this instance, but it should still be detected as existing.
        let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();

        let created = cs.create_empty_corpus("testcorpus", false).unwrap();

        assert!(
            !created,
            "creating a corpus left behind by a previous storage instance must return false"
        );
    }
}

/// `create_empty_corpus` must return `true` when the corpus name is new in
/// an empty temporary data directory.
#[test]
fn create_empty_corpus_nonexisting() {
    let tmp = tempfile::tempdir().unwrap();
    let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();

    let created = cs.create_empty_corpus("testcorpus", false).unwrap();

    assert!(
        created,
        "creating a corpus with an unused name must return true"
    );
}

#[test]
Expand Down Expand Up @@ -1257,6 +1331,135 @@ fn import_relative_corpus_with_linked_file() {
assert_eq!("The content of this file is not important.", file_content);
}

/// Importing under the name of a corpus that was just created through the
/// same storage instance must fail with `GraphAnnisError::CorpusExists`.
#[test]
fn import_existing_cached_corpus_no_overwrite() {
    let data_dir = tempfile::tempdir().unwrap();
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let graphml_file = manifest_dir.join("tests/SaltSampleCorpus.graphml");

    let storage = CorpusStorage::with_auto_cache_size(data_dir.path(), true).unwrap();
    storage.create_empty_corpus("testcorpus", false).unwrap();

    // NOTE(review): the fifth argument (`false`) is taken to be the
    // "overwrite existing" flag — confirm against `import_from_fs`.
    let outcome = storage.import_from_fs(
        &graphml_file,
        ImportFormat::GraphML,
        Some("testcorpus".into()),
        false,
        false,
        |_| {},
    );

    assert!(matches!(outcome, Err(GraphAnnisError::CorpusExists(_))));
}

/// Importing under the name of a corpus that was created by an earlier
/// storage instance on the same directory must fail with
/// `GraphAnnisError::CorpusExists`.
#[test]
fn import_existing_uncached_corpus_no_overwrite() {
    let data_dir = tempfile::tempdir().unwrap();
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

    // First instance only creates the corpus and is released right away.
    let creator = CorpusStorage::with_auto_cache_size(data_dir.path(), true).unwrap();
    creator.create_empty_corpus("testcorpus", false).unwrap();
    drop(creator);

    // Second, fresh instance on the same directory attempts the import.
    let importer = CorpusStorage::with_auto_cache_size(data_dir.path(), true).unwrap();
    let graphml_file = manifest_dir.join("tests/SaltSampleCorpus.graphml");
    // NOTE(review): the fifth argument (`false`) is taken to be the
    // "overwrite existing" flag — confirm against `import_from_fs`.
    let outcome = importer.import_from_fs(
        &graphml_file,
        ImportFormat::GraphML,
        Some("testcorpus".into()),
        false,
        false,
        |_| {},
    );

    assert!(matches!(outcome, Err(GraphAnnisError::CorpusExists(_))));
}

/// Imports one GraphML file as `testcorpus`, then imports a different file
/// under the same name through the same storage instance, and checks that the
/// ordering-component count matches the second file only.
#[test]
fn import_existing_cached_corpus_overwrite() {
    let data_dir = tempfile::tempdir().unwrap();
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

    let storage = CorpusStorage::with_auto_cache_size(data_dir.path(), true).unwrap();

    // Number of `Ordering` components currently listed for `testcorpus`.
    let count_ordering_components = || {
        storage
            .list_components("testcorpus", Some(AnnotationComponentType::Ordering), None)
            .unwrap()
            .len()
    };

    let first_file = manifest_dir.join("tests/SegmentationWithGaps.graphml");
    storage
        .import_from_fs(
            &first_file,
            ImportFormat::GraphML,
            Some("testcorpus".into()),
            false,
            false,
            |_| {},
        )
        .unwrap();
    assert_eq!(3, count_ordering_components());

    // NOTE(review): the fifth argument (`true`) is taken to be the
    // "overwrite existing" flag — confirm against `import_from_fs`.
    let second_file = manifest_dir.join("tests/SaltSampleCorpus.graphml");
    storage
        .import_from_fs(
            &second_file,
            ImportFormat::GraphML,
            Some("testcorpus".into()),
            false,
            true,
            |_| {},
        )
        .unwrap();

    // Fewer ordering components than before shows that the second import
    // replaced the old corpus instead of being added on top of it.
    assert_eq!(1, count_ordering_components());
}

/// Imports one GraphML file as `testcorpus` with a first storage instance,
/// then imports a different file under the same name with a second instance
/// on the same directory, and checks that the ordering-component count
/// matches the second file only.
#[test]
fn import_existing_uncached_corpus_overwrite() {
    /// Number of `Ordering` components currently listed for `testcorpus`.
    fn ordering_component_count(storage: &CorpusStorage) -> usize {
        let components = storage
            .list_components("testcorpus", Some(AnnotationComponentType::Ordering), None)
            .unwrap();
        components.len()
    }

    let data_dir = tempfile::tempdir().unwrap();
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

    {
        // First instance: initial import, released at the end of this scope.
        let storage = CorpusStorage::with_auto_cache_size(data_dir.path(), true).unwrap();
        let first_file = manifest_dir.join("tests/SegmentationWithGaps.graphml");
        storage
            .import_from_fs(
                &first_file,
                ImportFormat::GraphML,
                Some("testcorpus".into()),
                false,
                false,
                |_| {},
            )
            .unwrap();

        assert_eq!(3, ordering_component_count(&storage));
    }
    {
        // Second, fresh instance on the same directory performs the re-import.
        // NOTE(review): the fifth argument (`true`) is taken to be the
        // "overwrite existing" flag — confirm against `import_from_fs`.
        let storage = CorpusStorage::with_auto_cache_size(data_dir.path(), true).unwrap();
        let second_file = manifest_dir.join("tests/SaltSampleCorpus.graphml");
        storage
            .import_from_fs(
                &second_file,
                ImportFormat::GraphML,
                Some("testcorpus".into()),
                false,
                true,
                |_| {},
            )
            .unwrap();

        // Fewer ordering components than before shows that the second import
        // replaced the old corpus instead of being added on top of it.
        assert_eq!(1, ordering_component_count(&storage));
    }
}

#[test]
#[serial]
fn load_legacy_binary_corpus_1() {
Expand Down