"""
`pyigt` supports the notation for morpheme/gloss structure proposed by the
`Leipzig Glossing Rules <https://www.eva.mpg.de/lingua/resources/glossing-rules.php>`_.
According to LGR Rule 1, object language and gloss lines have to be word-aligned. Such aligned
pairs of a word and a corresponding gloss are modeled via the :class:`GlossedWord` class.
If an IGT conforms to Rule 2, glossed words are lists of aligned
:class:`~pyigt.lgrmorphemes.GlossedMorpheme` pairs.
The provisions of Rule 4 (and following), i.e. the structure of morpheme glosses, are implemented
as subclasses of :class:`GlossElement`.
"""
import re
import itertools
from typing import Optional
import dataclasses
import unicodedata
from collections.abc import Generator
from pyigt.util import is_standard_abbr, is_generic_abbr
# Names exported by `from pyigt.lgrmorphemes import *`, grouped by role.
__all__ = [
    # Types of morpheme gloss elements:
    'GlossElement',
    'Infix',
    'GlossElementAfterSemicolon',
    'GlossElementAfterColon',
    'GlossElementAfterBackslash',
    'PatientlikeArgument',
    'NonovertElement',
    'InherentCategory',
    # Types of morphemes:
    'Morpheme',
    'MORPHEME_SEPARATORS',
    'split_morphemes',
    'remove_morpheme_separators',
    # Wrapper
    'GlossedWord',
    'GlossedMorpheme',
]
# Characters which delimit morphemes inside a word and inside the aligned gloss.
MORPHEME_SEPARATORS = [
    '-',  # Rule 2: regular morpheme boundary
    '=',  # Rule 2: clitic boundary
    '~',  # Rule 10: reduplication
]
def split_morphemes(s: str) -> list[str]:
    """
    Split a string at each morpheme separator.

    The separators are captured, i.e. they show up as items of their own between the morphemes.
    A falsy argument (e.g. `None`) is treated like the empty string.
    """
    alternatives = '|'.join(map(re.escape, MORPHEME_SEPARATORS))
    text = s if s else ''
    return re.split(f"({alternatives})", text)
def remove_morpheme_separators(s: str) -> str:
    """Return the string with every morpheme separator character stripped out."""
    kept = []
    for chunk in split_morphemes(s):
        if chunk in MORPHEME_SEPARATORS:
            continue
        kept.append(chunk)
    return ''.join(kept)
class GlossElement(str):
    """
    Rule 4. Gloss elements are separated by ".".

    :cvar start: Marker which introduces an element of this type when elements are combined.
    :cvar end: Closing marker for element types which are enclosed in brackets, otherwise `None`.
    :cvar in_gloss_only: Whether this element type is only recognized when parsing glosses.
    :ivar prev: The preceding element within the same morpheme gloss, or `None`.
    :ivar next: The following element within the same morpheme gloss, or `None`.
    """
    start = '.'
    end = None
    in_gloss_only = True

    def __init__(self, _):
        # The string content itself is handled by `str.__new__`; only the links are set up here.
        self.prev = self.next = None

    def __repr__(self):
        # Non-ASCII characters are replaced, so the representation is always plain ASCII.
        ascii_text = self.encode('ascii', 'replace').decode()
        kind = self.__class__.__name__
        return f'<{kind} "{ascii_text}">'

    @property
    def is_agentlike_argument(self) -> bool:
        """Whether the element is immediately followed by a patient-like argument."""
        following = self.next
        return isinstance(following, PatientlikeArgument)

    @property
    def is_standard_abbreviation(self) -> bool:
        """Result of `is_standard_abbr` for this element."""
        return is_standard_abbr(self)

    @property
    def is_category_label(self) -> bool:
        """Result of `is_generic_abbr` for this element."""
        return is_generic_abbr(self)
class Infix(GlossElement, str):
    """
    Rule 9. Infixes are enclosed in angle brackets.

    Unlike the gloss-only element types, infixes are also recognized when the string being
    parsed is not a gloss (`in_gloss_only` is switched off).
    """
    start, end = '<', '>'
    in_gloss_only = False
class GlossElementAfterSemicolon(GlossElement):
    """
    Rule 4B. Distinct gloss elements can be separated by ";".

    An instance represents the element following such a semicolon.
    """
    start = ';'
class GlossElementAfterColon(GlossElement):
    """
    Rule 4C. Gloss elements corresponding to "hidden" object language elements are separated
    by ":".

    An instance represents the element following such a colon.
    """
    start = ':'
class GlossElementAfterBackslash(GlossElement):
    """
    Rule 4D. Morphophonological change is marked with a leading "\\".

    An instance represents the element following such a backslash.
    """
    start = '\\'
class PatientlikeArgument(GlossElement):
    """
    Rule 4E. Patient-like arguments are marked with a leading ">".

    Note: The corresponding agent-like argument is the element available as `prev`.
    """
    start = '>'
class NonovertElement(GlossElement):
    """
    Rule 6. Non-overt elements can be enclosed in square brackets.

    The element text is the content between the opening and the closing bracket.
    """
    start, end = '[', ']'
class InherentCategory(GlossElement):
    """
    Rule 7. Inherent categories can be enclosed in round brackets.

    The element text is the content between the opening and the closing bracket.
    """
    start, end = '(', ')'
class GlossElements(list):
    """
    A container class for a list of `GlossElement` instances, together with functionality to
    round-trip from `str`.
    """
    def __str__(self):
        """
        Serialize the elements back into morpheme (gloss) notation.

        Consecutive elements with the same kind of enclosure are written into one shared pair of
        brackets, separated by the generic element separator. Elements with different kinds of
        enclosure each get their own pair of brackets.
        """
        s, prev_enclosed = '', None
        for ge in self:
            if ge.end and prev_enclosed == ge.end:
                # Another element in the same kind of enclosure: Replace the prematurely appended
                # end marker (always the last character here) with a separator and close again.
                s = s[:-1] + GlossElement.start + str(ge) + ge.end
            else:
                # An opening marker is always written; a separator is only needed between
                # elements, and not directly after a closing bracket.
                if (s and not prev_enclosed) or ge.end:
                    s += ge.start
                s += str(ge)
                if ge.end:
                    s += ge.end
            prev_enclosed = ge.end
        return s

    @staticmethod
    def _iter_gloss_elements(s, type_) -> 'Generator[GlossElement, None, None]':
        """
        Parse a morpheme string into typed gloss elements.

        :param s: The morpheme form or morpheme gloss.
        :param type_: `'gloss'` enables all element types; for any other value only the element \
        types which are not flagged as `in_gloss_only` are recognized.
        """
        # Map start markers to element types.
        classes = {GlossElement.start: GlossElement} if type_ == 'gloss' else {}
        for cls in GlossElement.__subclasses__():
            if (not cls.in_gloss_only) or type_ == 'gloss':
                assert cls.start not in classes
                classes[cls.start] = cls

        e, cls = '', GlossElement
        s = list(reversed(s))  # Reversed, so that characters can be consumed cheaply with `pop`.
        while s:
            c = s.pop()
            if c in classes:
                if e:
                    # Note: We allow the complete morpheme gloss to start with a separator!
                    # That is required for infixes, but otherwise not mentioned in LGR.
                    yield cls(e)
                e, cls = '', classes[c]
                if s and cls.end:  # Consume the characters up to the end marker.
                    cc = s.pop()
                    while s and (cc != cls.end):
                        e += cc
                        cc = s.pop()
                    if cc != cls.end:
                        # Unterminated enclosure: The input ran out before an end marker was
                        # seen, so the last character is content and must not be dropped.
                        e += cc
                    # Several elements may share one enclosure.
                    for ee in e.split(GlossElement.start):
                        yield cls(ee)
                    e, cls = '', GlossElement
            else:
                e += c
        if e:
            yield cls(e)

    @classmethod
    def from_morpheme(cls, s: str, type_) -> 'GlossElements':
        """
        Instantiate gloss elements from a string.

        The elements are linked with each other via their `prev` and `next` attributes.
        """
        res, prev = [], None
        for ge in GlossElements._iter_gloss_elements(s, type_):
            # Elements are strings and may be empty (e.g. for "<>"), thus we must compare with
            # `None` rather than rely on truthiness to keep the chain intact.
            if prev is not None:
                ge.prev = prev
                prev.next = ge
            res.append(ge)
            prev = ge
        return cls(res)
class Morpheme(str):
    """
    Rule 2. Morphemes are separated by "-".

    :ivar type: Passed on to the element parser; initialized as `None` and set to `'word'` or \
    `'gloss'` when the morpheme is wrapped in a :class:`GlossedMorpheme`.
    """
    sep = '-'

    def __init__(self, _):
        self.type = None

    def __repr__(self):
        # Non-ASCII characters are replaced, so the representation is always plain ASCII.
        ascii_text = self.encode('ascii', 'replace').decode()
        kind = self.__class__.__name__
        return f'<{kind} "{ascii_text}">'

    @property
    def elements(self) -> list[GlossElement]:
        """
        The morpheme parsed into its elements.

        >>> m = Morpheme('a<b>c')
        >>> m.elements
        [<GlossElement "a">, <Infix "b">, <GlossElement "c">]
        """
        text = str(self)
        return GlossElements.from_morpheme(text, self.type)

    @property
    def form_and_infixes(self) -> tuple[str, list[str]]:
        """
        The morpheme form without infixes, and the list of infixes.

        >>> m = Morpheme('a<b>c')
        >>> m.form_and_infixes
        ('ac', ['b'])
        """
        infixes, remainder = [], []
        for element in self.elements:
            target = infixes if isinstance(element, Infix) else remainder
            target.append(str(element))
        return ''.join(remainder), infixes
@dataclasses.dataclass
class GlossedMorpheme:
    """
    A (morpheme, gloss) pair.

    :ivar morpheme: The morpheme form.
    :ivar gloss: The literal gloss.
    :ivar sep: The morpheme separator preceding this morpheme.
    :ivar prev: Points to the previous `GlossedMorpheme` in a word, or `None`.
    :ivar next: Points to the next `GlossedMorpheme` in a word, or `None`.
    """
    morpheme: Morpheme
    gloss: Morpheme
    sep: str
    prev: Optional['GlossedMorpheme'] = None
    next: Optional['GlossedMorpheme'] = None

    def __post_init__(self):
        # Plain strings are accepted on initialization; both members are wrapped as `Morpheme`
        # and tagged with the type which selects the applicable element types on parsing.
        self.morpheme = Morpheme(self.morpheme)
        self.morpheme.type = 'word'
        self.gloss = Morpheme(self.gloss)
        self.gloss.type = 'gloss'

    def __eq__(self, other):
        """
        Two glossed morphemes are equal if form and gloss are equal; `sep` and the links to
        neighbouring morphemes are ignored.
        """
        if not isinstance(other, GlossedMorpheme):
            # Let Python fall back to its default handling (e.g. `gm == None` is `False`)
            # instead of failing with an `AttributeError`.
            return NotImplemented
        return self.morpheme == other.morpheme and self.gloss == other.gloss

    def __repr__(self):
        return f'<{self.__class__.__name__} morpheme={self.morpheme} gloss={self.gloss}>'

    @property
    def form(self) -> str:
        """
        Removes sentence-level markup (i.e. punctuation etc.) from `.morpheme`.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedMorpheme
            >>> gm = GlossedMorpheme(morpheme='"[ab.c', gloss="abc", sep='-')
            >>> gm.form
            'abc'
        """
        # Unicode general categories of characters to drop: punctuation and math symbols.
        markup = {'Po', 'Pf', 'Ps', 'Pd', 'Pe', 'Pi', 'Sm'}
        return ''.join(c for c in self.morpheme if unicodedata.category(c) not in markup)

    @property
    def first(self) -> bool:
        """Whether the morpheme is the first in the word."""
        return not bool(self.prev)

    @property
    def last(self) -> bool:
        """Whether the morpheme is the last in the word."""
        return not bool(self.next)

    @property
    def grammatical_concepts(self) -> list[str]:
        """
        Grammatical concepts, referenced with category labels according to Rule 3, used in morpheme
        gloss.

        .. note::

            Gloss element separators according to Rule 4B and 4C are interpreted as signaling a
            separate concept.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedMorpheme
            >>> gm = GlossedMorpheme(morpheme='abc', gloss='ABC.DEF:GHI;JKL', sep='.')
            >>> gm.grammatical_concepts
            ['ABC.DEF', 'GHI', 'JKL']
        """
        return list(self._glosses('grammatical'))

    @property
    def lexical_concepts(self) -> list[str]:
        """
        Gloss elements not recognized as category labels are interpreted as lexical concepts.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedMorpheme
            >>> gm = GlossedMorpheme(morpheme='çık', gloss='come_out', sep='.')
            >>> gm.lexical_concepts
            ['come out']
        """
        return list(self._glosses('lexical'))

    def _glosses(self, type_):
        """
        Yield the concepts of the requested type from the gloss, with "_" replaced by " ".

        :param type_: `'lexical'` selects elements which are not category labels, \
        `'grammatical'` selects elements which are.
        """
        s = ''
        for ge in self.gloss.elements:
            is_label = ge.is_category_label
            wanted = (type_ == 'lexical' and not is_label) or \
                (type_ == 'grammatical' and is_label)
            if isinstance(ge, (GlossElementAfterColon, GlossElementAfterSemicolon)):
                # Something new is starting: Flush what has been collected so far.
                if s:
                    yield s.replace('_', ' ')
                s = str(ge) if wanted else ''
            elif wanted:
                if s:
                    # Category labels keep their separator, lexical material is joined by space.
                    s += ge.start if is_label else ' '
                s += str(ge)
        if s:
            yield s.replace('_', ' ')
@dataclasses.dataclass
class GlossedWord:
    """
    A (word, gloss) pair, corresponding to two aligned items from IGT according to LGR.

    Provides list-like access to its :class:`GlossedMorpheme` s.

    :ivar word: The object language word.
    :ivar gloss: The gloss aligned with the word.
    :ivar glossed_morphemes: The aligned (morpheme, gloss) pairs, populated on initialization.
    :ivar strict: Whether to raise a `ValueError` if word and gloss cannot be aligned.
    :ivar is_valid: `False` if a mismatch between word and gloss was detected (and `strict` is \
    off), `True` otherwise.
    """
    word: str
    gloss: str
    glossed_morphemes: list[GlossedMorpheme] = dataclasses.field(default_factory=list)
    strict: bool = False

    def __post_init__(self):
        # Always define the flag, so that it can be read for well-formed words, too.
        self.is_valid = True
        mm, gg = split_morphemes(self.word), split_morphemes(self.gloss)
        if len(mm) != len(gg):
            self._flag_mismatch()
        sep, prev = None, None
        for m, g in zip(mm, gg):
            if not m and not g:
                continue  # Morpheme starts or ends with separator
            if m in MORPHEME_SEPARATORS:
                if m != g:
                    self._flag_mismatch()
                    break
                sep = m
            else:
                if not (m and g):
                    # Only one of word and gloss has content at this position. This is invalid
                    # input rather than a programming error, thus no `assert`.
                    self._flag_mismatch()
                    break
                gm = GlossedMorpheme(m, g, sep=sep)
                self.glossed_morphemes.append(gm)
                if prev is not None:
                    prev.next = gm
                    gm.prev = prev
                prev = gm

    def _flag_mismatch(self):
        """Record a misalignment of word and gloss; raises `ValueError` in strict mode."""
        if self.strict:
            raise ValueError(f'Morpheme separator mismatch: {self.word} :: {self.gloss}')
        self.is_valid = False

    def __eq__(self, other):
        """Two glossed words are equal if their lists of glossed morphemes are equal."""
        if not isinstance(other, GlossedWord):
            # Let Python fall back to its default handling instead of failing with an
            # `AttributeError` for unrelated objects.
            return NotImplemented
        return self.glossed_morphemes == other.glossed_morphemes

    def __repr__(self):
        return f'<{self.__class__.__name__} word={self.word} gloss={self.gloss}>'

    def __iter__(self):
        return iter(self.glossed_morphemes)

    def __getitem__(self, item):
        return self.glossed_morphemes[item]

    def __len__(self):
        return len(self.glossed_morphemes)

    @property
    def form(self) -> str:
        """
        Removes sentence-level markup and morpheme separators from `.word`.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedWord
            >>> gw = GlossedWord(word='"An-fangs', gloss="a-b")
            >>> gw.form
            'Anfangs'
        """
        return ''.join(gm.form for gm in self)

    @property
    def word_from_morphemes(self) -> str:
        """
        The word, re-assembled from the parsed morphemes.

        >>> gw = GlossedWord('a-word', 'a.DU-gloss')
        >>> gw.word_from_morphemes
        'a-word'
        """
        # The separator is only written between morphemes, i.e. not before the first one.
        return ''.join(itertools.chain.from_iterable(
            (gm.sep if gm.prev else '', str(gm.morpheme.elements)) for gm in self))

    @property
    def gloss_from_morphemes(self) -> str:
        """
        The gloss, re-assembled from the parsed morpheme glosses.

        >>> gw = GlossedWord('a-word', 'a.DU-gloss')
        >>> gw.gloss_from_morphemes
        'a.DU-gloss'
        """
        # The separator is only written between morphemes, i.e. not before the first one.
        return ''.join(itertools.chain.from_iterable(
            (gm.sep if gm.prev else '', str(gm.gloss.elements)) for gm in self))