Skip to content

Commit

Permalink
[kosha] Add much more metadata
Browse files Browse the repository at this point in the history
- Add Kosha.__len__

- Add DhatuMeta with the following fields:
    - artha_sa
    - artha_en
    - artha_hi
    - karmatva
    - ittva
    - pada

- Add PratipadikaEntry.lingas and BaseKrt.lingas
  • Loading branch information
akprasad committed Jan 21, 2025
1 parent 178cc8f commit 1426587
Show file tree
Hide file tree
Showing 19 changed files with 533 additions and 70 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 38 additions & 7 deletions bindings-python/src/kosha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ use pyo3::exceptions::{PyKeyError, PyOSError, PyValueError};
use pyo3::prelude::*;
use std::path::PathBuf;
use vidyut_kosha::entries::{
BasicPratipadikaEntry, DhatuEntry, KrdantaEntry, PadaEntry, PratipadikaEntry, SubantaEntry,
TinantaEntry,
BasicPratipadikaEntry, DhatuEntry, DhatuMeta, KrdantaEntry, PadaEntry, PratipadikaEntry,
SubantaEntry, TinantaEntry,
};
use vidyut_kosha::{Builder, Kosha};
use vidyut_prakriya::args as vp;
Expand All @@ -27,9 +27,9 @@ impl PyKosha {
fn new(path: PathBuf) -> PyResult<Self> {
match Kosha::new(path.clone()) {
Ok(kosha) => Ok(Self(kosha)),
Err(_) => Err(PyOSError::new_err(
"Unknown error. The input file might be missing.",
)),
Err(e) => Err(PyOSError::new_err(format!(
"Could not load kosha. Error was: {e:?}"
))),
}
}

Expand All @@ -48,6 +48,10 @@ impl PyKosha {
}
}

fn __len__(&self) -> usize {
self.0.len()
}

/// Returns the fixed string `Kosha()` as this object's Python `repr`.
fn __repr__(&self) -> String {
    "Kosha()".to_owned()
}
Expand Down Expand Up @@ -131,11 +135,38 @@ pub struct PyBuilder {
/// Owned side storage used while converting Python-side entries into kosha entries.
#[derive(Default)]
struct SmallRegistry {
    // NOTE(review): presumably the linga lists that pratipadika entries refer to; the code
    // that fills this field is not visible here, so confirm against `to_pratipadika_entry`.
    lingas: Vec<Vec<vp::Linga>>,
    // Metadata values pushed by `to_dhatu_entry`; the `DhatuEntry` it returns borrows the
    // most recently pushed element.
    dhatu_meta: Vec<DhatuMeta>,
}

impl SmallRegistry {
fn to_dhatu_entry<'a>(&self, entry: &'a PyDhatuEntry) -> DhatuEntry<'a> {
DhatuEntry::new(entry.dhatu.as_rust(), &entry.clean_text)
/// Builds a `DhatuEntry` for `entry`, keeping its metadata alive inside this registry.
///
/// The metadata fields are copied out of `entry` into a new `DhatuMeta`, which is pushed onto
/// `self.dhatu_meta`. The returned entry borrows that stored value.
///
/// # Panics
///
/// Panics if the `DhatuMeta` builder fails to build.
fn to_dhatu_entry<'a>(&'a mut self, entry: &'a PyDhatuEntry) -> DhatuEntry<'a> {
    let mut meta = DhatuMeta::builder().clean_text(entry.clean_text.to_string());

    // Optional fields are forwarded to the builder only when the Python-side entry has them.
    if let Some(artha) = entry.artha_sa.as_ref() {
        meta = meta.artha_sa(artha.to_string());
    }
    if let Some(artha) = entry.artha_en.as_ref() {
        meta = meta.artha_en(artha.to_string());
    }
    if let Some(artha) = entry.artha_hi.as_ref() {
        meta = meta.artha_hi(artha.to_string());
    }
    if let Some(ittva) = entry.ittva.as_ref() {
        meta = meta.ittva(ittva.to_string());
    }
    if let Some(karmatva) = entry.karmatva.as_ref() {
        meta = meta.karmatva(karmatva.to_string());
    }
    if let Some(pada) = entry.pada.as_ref() {
        meta = meta.pada(pada.to_string());
    }

    // Store the built value first so the returned entry can borrow it from the registry.
    let built = meta.build().expect("clean_text defined");
    self.dhatu_meta.push(built);
    let stored = self.dhatu_meta.last().expect("just pushed");

    DhatuEntry::new(entry.dhatu.as_rust()).with_meta(stored)
}

fn to_pratipadika_entry<'a>(
Expand Down
82 changes: 71 additions & 11 deletions bindings-python/src/kosha/entries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ fn py_repr_string(text: &str) -> String {
}
}

/// Formats an optional string for use inside a Python-style `repr`.
///
/// `Some(text)` is rendered by delegating to `py_repr_string`; `None` is rendered as the
/// literal text `None`.
fn py_repr_option_string(option: &Option<String>) -> String {
    match option {
        // `text` is already a `&String`, which deref-coerces to `&str` without another borrow.
        Some(text) => py_repr_string(text),
        None => String::from("None"),
    }
}

/// A verb root.
#[pyclass(name = "DhatuEntry", get_all, eq, ord)]
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
Expand All @@ -29,30 +36,62 @@ pub struct PyDhatuEntry {
/// - `qukf\\Y` --> `kf`
/// - `vidi~` --> `vind`
pub(crate) clean_text: String,

/// The meaning of this dhatu's *mūla* as an SLP1 string.
///
/// We have meaning strings only for the ~2000 *mūla* dhatus from the Dhatupatha. Any roots
/// derived from these ~2000 will share their `artha` with the dhatu they come from.
///
/// Examples:
///
/// - `BU` --> `sattAyAm`
/// - `aBiBU` --> `sattAyAm`
/// - `aBibuBUza` --> `sattAyAm`
pub(crate) artha_sa: Option<String>,
pub(crate) artha_en: Option<String>,
pub(crate) artha_hi: Option<String>,
pub(crate) karmatva: Option<String>,
pub(crate) ittva: Option<String>,
pub(crate) pada: Option<String>,
}

#[pymethods]
impl PyDhatuEntry {
/// Create a new `DhatuEntry`.
#[new]
#[pyo3(signature = (*, dhatu, clean_text))]
fn new(dhatu: PyDhatu, clean_text: String) -> Self {
Self { dhatu, clean_text }
#[pyo3(signature = (dhatu, clean_text, *, artha_sa = None, artha_en = None, artha_hi = None,
karmatva = None, ittva = None, pada = None))]
fn new(
dhatu: PyDhatu,
clean_text: String,
artha_sa: Option<String>,
artha_en: Option<String>,
artha_hi: Option<String>,
karmatva: Option<String>,
ittva: Option<String>,
pada: Option<String>,
) -> Self {
Self {
dhatu,
clean_text,
artha_sa,
artha_en,
artha_hi,
karmatva,
ittva,
pada,
}
}

fn __repr__(&self) -> String {
format!(
"DhatuEntry(dhatu={}, clean_text={})",
"DhatuEntry(dhatu={}, clean_text={}, artha_sa={})",
self.dhatu.__repr__(),
py_repr_string(&self.clean_text)
py_repr_string(&self.clean_text),
py_repr_option_string(&self.artha_sa),
)
}

#[getter]
fn dhatu(&self) -> PyDhatu {
self.dhatu.clone()
}

/// Convert this entry to a :class:`~vidyut.prakriya.Dhatu`.
pub fn to_prakriya_args(&self) -> PyDhatu {
self.dhatu.clone()
Expand All @@ -64,6 +103,12 @@ impl<'a> From<&DhatuEntry<'a>> for PyDhatuEntry {
Self {
dhatu: val.dhatu().into(),
clean_text: val.clean_text().to_string(),
artha_sa: val.artha_sa().map(|x| x.to_string()),
artha_en: val.artha_en().map(|x| x.to_string()),
artha_hi: val.artha_hi().map(|x| x.to_string()),
ittva: None,
karmatva: None,
pada: None,
}
}
}
Expand Down Expand Up @@ -140,6 +185,21 @@ impl PyPratipadikaEntry {
}
}

/// Lists the lingas that this *prātipadika* may take.
///
/// An empty list may mean that lingas are not yet implemented for this *prātipadika* type.
#[getter]
pub fn lingas(&self) -> Vec<PyLinga> {
    match self {
        Self::Basic { lingas, .. } => lingas.clone(),
        Self::Krdanta { krt, .. } => {
            // For krdantas, the lingas come from the underlying `BaseKrt`.
            let base = vp::BaseKrt::from(*krt);
            base.lingas().iter().copied().map(Into::into).collect()
        }
    }
}

#[getter]
pub fn is_avyaya(&self) -> bool {
match self {
Expand All @@ -158,7 +218,7 @@ impl PyPratipadikaEntry {
krt,
prayoga: _,
lakara: _,
} => PyPratipadika::krdanta(dhatu_entry.dhatu().clone(), krt.clone()),
} => PyPratipadika::krdanta(dhatu_entry.dhatu.clone(), krt.clone()),
}
}
}
Expand Down
14 changes: 8 additions & 6 deletions bindings-python/test/unit/kosha/test_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@

def test_dhatu_entry():
gam = Dhatu.mula("ga\\mx~", Gana.Bhvadi)
entry = DhatuEntry(dhatu=gam, clean_text="gam")
entry = DhatuEntry(dhatu=gam, clean_text="gam", artha_sa="gatO")

assert entry.dhatu == gam
assert entry.clean_text == "gam"
assert entry.artha_sa == "gatO"

# Nested attributes
assert entry.dhatu.aupadeshika == "ga\\mx~"
Expand All @@ -32,10 +33,10 @@ def test_dhatu_entry():

def test_dhatu_entry__dunders():
gam = Dhatu.mula("ga\\mx~", Gana.Bhvadi)
entry_gam = DhatuEntry(dhatu=gam, clean_text="gam")
entry_gam = DhatuEntry(dhatu=gam, clean_text="gam", artha_sa="gatO")

bhu = Dhatu.mula("BU", Gana.Bhvadi)
entry_bhu = DhatuEntry(dhatu=bhu, clean_text="BU")
entry_bhu = DhatuEntry(dhatu=bhu, clean_text="BU", artha_sa="sattAyAm")

# __eq__, __ne__
assert entry_gam == entry_gam
Expand All @@ -47,7 +48,7 @@ def test_dhatu_entry__dunders():
# __repr__
assert repr(entry_gam) == (
"DhatuEntry(dhatu=Dhatu(aupadeshika='ga\\mx~', gana=Gana.Bhvadi), "
"clean_text='gam')"
"clean_text='gam', artha_sa='gatO')"
)


Expand All @@ -70,6 +71,7 @@ def test_pratipadika_entry__krdanta():
assert gata.krt == Krt.kta
assert gata.prayoga is None
assert gata.lakara is None
assert gata.lingas == [Linga.Pum, Linga.Stri, Linga.Napumsaka]

v = Vyakarana()
results = {p.text for p in v.derive(gata)}
Expand Down Expand Up @@ -99,7 +101,7 @@ def test_pratipadika_entry__dunders():

assert repr(gata_entry) == (
"PratipadikaEntry.Krdanta(dhatu_entry=DhatuEntry(dhatu="
"Dhatu(aupadeshika='ga\\mx~', gana=Gana.Bhvadi), clean_text='gam'), "
"Dhatu(aupadeshika='ga\\mx~', gana=Gana.Bhvadi), clean_text='gam', artha_sa=None), "
"krt=Krt.kta, prayoga=None, lakara=None)"
)

Expand Down Expand Up @@ -202,7 +204,7 @@ def test_pada_entry__dunders():

assert repr(gacchati_pada) == (
"PadaEntry.Tinanta(dhatu_entry=DhatuEntry(dhatu="
"Dhatu(aupadeshika='ga\\mx~', gana=Gana.Bhvadi), clean_text='gam'), "
"Dhatu(aupadeshika='ga\\mx~', gana=Gana.Bhvadi), clean_text='gam', artha_sa=None), "
"prayoga=Prayoga.Kartari, lakara=Lakara.Lat, purusha=Purusha.Prathama, vacana=Vacana.Eka)"
)

Expand Down
6 changes: 5 additions & 1 deletion bindings-python/test/unit/kosha/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ def test_getitem(kosha):
assert kosha["gacCati"] is not None

with pytest.raises(KeyError):
x = kosha["missing"]
_ = kosha["missing"]


def test_len(kosha):
assert len(kosha) == 3


def test_repr(kosha):
Expand Down
4 changes: 3 additions & 1 deletion bindings-python/vidyut/docs/source/kosha.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,9 @@ passed to :meth:`vidyut.prakriya.Vyakarana.derive`:
`vidyut.kosha` types contain useful metadata:

- :class:`DhatuEntry` contains `clean_text`, which is the dictionary version
of the dhatu with sandhi applied and accent marks removed.
of the dhatu with sandhi applied and accent marks removed. It also contains
meanings in Sanskrit (`artha_sa`), English (`artha_en`), and Hindi (`artha_hi`)
as well as some other metadata.

- :class:`PratipadikaEntry` contains `lingas`, which includes the lingas
typically used with this pratipadika.
Expand Down
1 change: 1 addition & 0 deletions vidyut-data/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ clap.workspace = true
csv = "1.3.1"
fst = "0.4.7"
regex = "1.11.1"
serde.workspace = true
1 change: 1 addition & 0 deletions vidyut-data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ create_sandhi_rules:

# Creates a kosha and write it to disk.
create_kosha:
# cd scripts && uv run fetch_dhatu_metadata.py > ../data/raw/lex/dhatu-metadata.csv
RUST_BACKTRACE=1 RUST_LOG=info cargo run --release --bin create_kosha -- \
--input-dir data/raw/lex \
--dhatupatha ../vidyut-prakriya/data/dhatupatha.tsv \
Expand Down
1 change: 1 addition & 0 deletions vidyut-data/scripts/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.13
4 changes: 4 additions & 0 deletions vidyut-data/scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Data scripts
============

Simple utility scripts for creating Sanskrit data.
48 changes: 48 additions & 0 deletions vidyut-data/scripts/fetch_dhatu_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Creates dhatu metadata based on data from ashtadhyayi.com.
Usage:
uv run fetch_dhatu_metadata.py
"""


import csv
import io
import json
import pprint
import urllib.request
from vidyut.lipi import transliterate, Scheme


def load_metadata() -> dict:
    """Download and parse the dhatu metadata JSON from the ashtadhyayi-com data repository.

    Returns:
        The decoded JSON document.

    Raises:
        urllib.error.URLError: if the request fails.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    url = "https://github.com/ashtadhyayi-com/data/raw/refs/heads/master/dhatu/data.txt"
    # Use the response as a context manager so the connection is closed deterministically.
    with urllib.request.urlopen(url) as f:
        return json.load(f)


# Download the upstream document; the dhatu records live under its "data" key.
data = load_metadata()
dhatus = data["data"]

# Accumulate the CSV in memory, then emit it with a single print call.
out = io.StringIO()
w = csv.writer(out)
w.writerow(["code", "artha_en", "artha_hi", "karma", "pada", "settva"])

for dhatu in dhatus:
    artha_en, artha_hi, code = (
        dhatu["artha_english"],
        dhatu["artha_hindi"],
        dhatu["baseindex"],
    )
    karma, pada, settva = dhatu["karma"], dhatu["pada"], dhatu["settva"]

    # Each flag must be one of the known single-letter codes, or "-" when unset.
    assert karma in {"S", "A", "D", "-"}, karma
    assert pada in {"P", "A", "U", "-"}, pada
    assert settva in {"S", "A", "V", "-"}, settva

    if karma == "-":
        # A record without karma is expected to lack pada and settva too; it is skipped.
        assert karma == pada == settva == "-"
        continue

    w.writerow((code, artha_en, artha_hi, karma, pada, settva))

text = out.getvalue()
print(text)
9 changes: 9 additions & 0 deletions vidyut-data/scripts/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[project]
name = "scripts"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"vidyut>=0.3.1",
]
Loading

0 comments on commit 1426587

Please sign in to comment.