Source code for pyigt.lgrmorphemes

"""
`pyigt` supports the notation for morpheme/gloss structure proposed by the
`Leipzig Glossing Rules <https://www.eva.mpg.de/lingua/resources/glossing-rules.php>`_.

According to LGR Rule 1, object language and gloss lines have to be word-aligned. Such aligned
pairs of a word and a corresponding gloss are modeled via the :class:`GlossedWord` class.

If an IGT conforms to Rule 2, glossed words are lists of aligned
:class:`~pyigt.lgrmorphemes.GlossedMorpheme` pairs.

The provisions of Rule 4 (and following), i.e. the structure of morpheme glosses, are implemented
as subclasses of :class:`GlossElement`.
"""
import re
import itertools
from typing import Optional
import dataclasses
import unicodedata
from collections.abc import Generator

from pyigt.util import is_standard_abbr, is_generic_abbr

# Public API of this module, i.e. the names bound by `from pyigt.lgrmorphemes import *`.
__all__ = [
    # Types of morpheme gloss elements:
    'GlossElement', 'Infix', 'GlossElementAfterSemicolon', 'GlossElementAfterColon',
    'GlossElementAfterBackslash', 'PatientlikeArgument', 'NonovertElement', 'InherentCategory',
    # Types of morphemes:
    'Morpheme', 'MORPHEME_SEPARATORS', 'split_morphemes', 'remove_morpheme_separators',
    # Wrapper
    'GlossedWord', 'GlossedMorpheme',
]
# Single characters at which words and word glosses are split into morphemes. Each entry is
# escaped and combined into one regular expression by `split_morphemes`.
MORPHEME_SEPARATORS = [
    '-',  # Rule 2
    '=',  # Rule 2, clitics
    '~',  # Rule 10
]


def split_morphemes(s: str) -> list[str]:
    """
    Break a word (or a word gloss) apart at every morpheme separator.

    The separators themselves are kept as items of the result, alternating with the text
    between them. A falsy argument (e.g. `None`) is treated like the empty string.
    """
    alternatives = '|'.join(map(re.escape, MORPHEME_SEPARATORS))
    # Wrapping the alternatives in a capturing group makes `re.split` retain the separators.
    return re.split('(' + alternatives + ')', s or '')
def remove_morpheme_separators(s: str) -> str:
    """Return `s` with every morpheme separator character stripped out."""
    kept = []
    for chunk in split_morphemes(s):
        # `split_morphemes` reports separators as items of their own, so they can be skipped.
        if chunk in MORPHEME_SEPARATORS:
            continue
        kept.append(chunk)
    return ''.join(kept)
class GlossElement(str):
    """
    Rule 4. One element of a morpheme gloss; elements are delimited by ".".

    :ivar start: The separator written in front of the element when it is combined with \
    another `GlossElement`.
    :ivar end: The closing marker of enclosed element types; `None` for plain elements.
    :ivar in_gloss_only: Whether this element type is only looked for in gloss strings.
    :ivar prev: The preceding element of the same morpheme, or `None`.
    :ivar next: The following element of the same morpheme, or `None`.
    """
    start = '.'
    end = None
    in_gloss_only = True

    def __init__(self, _):
        # The text content is handled by `str.__new__`; only the neighbour links are set here.
        self.prev = self.next = None

    def __repr__(self):
        # Non-ASCII characters are shown as "?" to keep the representation pure ASCII.
        ascii_only = self.encode('ascii', 'replace').decode()
        return '<{} "{}">'.format(self.__class__.__name__, ascii_only)

    @property
    def is_agentlike_argument(self) -> bool:
        """Whether the element is directly followed by a `PatientlikeArgument`."""
        return isinstance(self.next, PatientlikeArgument)

    @property
    def is_standard_abbreviation(self) -> bool:
        """The verdict of `pyigt.util.is_standard_abbr` for this element."""
        return is_standard_abbr(self)

    @property
    def is_category_label(self) -> bool:
        """The verdict of `pyigt.util.is_generic_abbr` for this element."""
        return is_generic_abbr(self)
class Infix(GlossElement):
    """
    Rule 9. Infixes are enclosed in angle brackets.

    In contrast to the default inherited from `GlossElement`, `in_gloss_only` is `False` for
    infixes, i.e. they are not restricted to gloss strings.
    """
    # Note: `GlossElement` already derives from `str`, so `str` does not have to be listed as
    # additional base class; the method resolution order is the same either way.
    start = '<'
    end = '>'
    in_gloss_only = False
class GlossElementAfterSemicolon(GlossElement):
    """
    Rule 4B. A gloss element which is set apart from the preceding one by ";" to mark it as
    distinct.
    """
    start = ';'
class GlossElementAfterColon(GlossElement):
    """
    Rule 4C. A gloss element introduced by ":", i.e. one corresponding to a "hidden" object
    language element.
    """
    start = ':'
class GlossElementAfterBackslash(GlossElement):
    """
    Rule 4D. A gloss element introduced by "\\", the marker for morphophonological change.
    """
    start = '\\'
class PatientlikeArgument(GlossElement):
    """
    Rule 4E. A gloss element introduced by ">", the marker for patient-like arguments.

    Note: The corresponding agent-like argument can be inferred by following the `prev`
    property.
    """
    start = '>'
class NonovertElement(GlossElement):
    """
    Rule 6. A gloss element enclosed in square brackets, the notation available for non-overt
    elements.
    """
    start = '['
    end = ']'
class InherentCategory(GlossElement):
    """
    Rule 7. A gloss element enclosed in round brackets, the notation available for inherent
    categories.
    """
    start = '('
    end = ')'
class GlossElements(list):
    """
    A container class for a list of `GlossElement` instances, together with functionality to
    round-trip from `str`.
    """
    def __str__(self):
        """
        Serialize the elements back to morpheme (gloss) notation.

        Each element is written with its `start` marker in front - except for a non-enclosed
        element at the very beginning or directly after an enclosed one - and with its `end`
        marker behind, if it has one. Directly adjacent elements enclosed by the same kind of
        end marker are merged into one enclosed group, separated by `GlossElement.start`.
        """
        s, prev_enclosed = '', None
        for ge in self:
            if prev_enclosed and ge.end == prev_enclosed:
                # Another element in the same kind of enclosure: Remove the prematurely
                # appended end marker and continue the group.
                # Elements with a *different* enclosure, e.g. "[A](B)", must not be merged;
                # they are handled like any other enclosed element below.
                s = s[:-1] + GlossElement.start + str(ge) + ge.end
            else:
                if (s and not prev_enclosed) or ge.end:
                    s += ge.start
                s += str(ge)
                if ge.end:
                    s += ge.end
            prev_enclosed = ge.end
        return s

    @staticmethod
    def _iter_gloss_elements(s, type_) -> Generator[GlossElement, None, None]:
        """
        Parse a morpheme string into gloss elements.

        :param s: The morpheme form or morpheme gloss to parse.
        :param type_: If `'gloss'`, all element types are recognized; otherwise only the \
        types with `in_gloss_only == False`.
        :return: Generator of instances of `GlossElement` or of one of its direct subclasses.
        """
        # Map start markers to the element class they introduce.
        classes = {GlossElement.start: GlossElement} if type_ == 'gloss' else {}
        for subcls in GlossElement.__subclasses__():
            if (not subcls.in_gloss_only) or type_ == 'gloss':
                # Start markers must be unambiguous (invariant of the class definitions).
                assert subcls.start not in classes
                classes[subcls.start] = subcls

        e, cls = '', GlossElement
        # Reversed, so that characters can be consumed from the front via `list.pop`.
        s = list(reversed(s))
        while s:
            c = s.pop()
            if c in classes:
                if e:
                    # Note: We allow the complete morpheme gloss to start with a separator!
                    # That is required for infixes, but otherwise not mentioned in LGR.
                    yield cls(e)
                e, cls = '', classes[c]
                if s and cls.end:
                    # Consume the characters up to the end marker.
                    cc = s.pop()
                    while s and (cc != cls.end):
                        e += cc
                        cc = s.pop()
                    if cc != cls.end:
                        # Input exhausted without end marker: The last character consumed is
                        # content and must not be dropped.
                        e += cc
                    for ee in e.split(GlossElement.start):
                        yield cls(ee)
                    e, cls = '', GlossElement
            else:
                e += c
        if e:
            yield cls(e)

    @classmethod
    def from_morpheme(cls, s: str, type_) -> 'GlossElements':
        """
        Instantiate gloss elements from a string.

        :param s: The morpheme form or morpheme gloss.
        :param type_: Passed on to the parser; `'gloss'` enables all gloss element types.
        :return: The parsed elements, linked to their neighbours via `prev` and `next`.
        """
        res, prev = [], None
        for ge in GlossElements._iter_gloss_elements(s, type_):
            # Elements are strings and may be empty (thus falsy), so compare with `None`
            # explicitly to keep the chain of neighbours intact.
            if prev is not None:
                ge.prev = prev
                prev.next = ge
            res.append(ge)
            prev = ge
        return cls(res)
class Morpheme(str):
    """
    Rule 2. Morphemes are separated by "-".

    :ivar type: Selects how the morpheme is parsed into elements; `None` until assigned.
    """
    sep = '-'

    def __init__(self, _):
        # The text content is handled by `str.__new__`.
        self.type = None

    def __repr__(self):
        # Non-ASCII characters are shown as "?" to keep the representation pure ASCII.
        ascii_only = self.encode('ascii', 'replace').decode()
        return '<{} "{}">'.format(self.__class__.__name__, ascii_only)

    @property
    def elements(self) -> list[GlossElement]:
        """
        The morpheme, parsed into gloss elements according to `self.type`.

        >>> m = Morpheme('a<b>c')
        >>> m.elements
        [<GlossElement "a">, <Infix "b">, <GlossElement "c">]
        """
        return GlossElements.from_morpheme(str(self), self.type)

    @property
    def form_and_infixes(self) -> tuple[str, list[str]]:
        """
        The morpheme form with infixes taken out, and the list of infixes.

        >>> m = Morpheme('a<b>c')
        >>> m.form_and_infixes
        ('ac', ['b'])
        """
        parsed = self.elements
        infixes = [str(el) for el in parsed if isinstance(el, Infix)]
        form = ''.join(str(el) for el in parsed if not isinstance(el, Infix))
        return form, infixes
@dataclasses.dataclass
class GlossedMorpheme:
    """
    A (morpheme, gloss) pair.

    :ivar morpheme: The morpheme form.
    :ivar gloss: The literal gloss.
    :ivar sep: The morpheme separator preceding this morpheme.
    :ivar prev: Points to the previous `GlossedMorpheme` in a word, or `None`.
    :ivar next: Points to the next `GlossedMorpheme` in a word, or `None`.
    """
    morpheme: Morpheme
    gloss: Morpheme
    sep: str
    prev: Optional['GlossedMorpheme'] = None
    next: Optional['GlossedMorpheme'] = None

    def __post_init__(self):
        # Wrap both members of the pair as `Morpheme` and tag them, so that each is parsed
        # with the set of gloss element types matching its line.
        self.morpheme = Morpheme(self.morpheme)
        self.morpheme.type = 'word'
        self.gloss = Morpheme(self.gloss)
        self.gloss.type = 'gloss'

    def __eq__(self, other):
        """
        Two glossed morphemes are equal if form and gloss are; `sep` and the links to the
        neighbours are not taken into account.
        """
        try:
            other_morpheme, other_gloss = other.morpheme, other.gloss
        except AttributeError:
            # Not comparable (e.g. `None`): Let Python fall back to its default handling
            # rather than failing with an `AttributeError`.
            return NotImplemented
        return self.morpheme == other_morpheme and self.gloss == other_gloss

    def __repr__(self):
        return f'<{self.__class__.__name__} morpheme={self.morpheme} gloss={self.gloss}>'

    @property
    def form(self) -> str:
        """
        Removes sentence-level markup (i.e. punctuation etc.) from `.morpheme`.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedMorpheme
            >>> gm = GlossedMorpheme(morpheme='"[ab.c', gloss="abc", sep='-')
            >>> gm.form
            'abc'
        """
        # Unicode general categories of the punctuation characters and math symbols to strip.
        return ''.join(
            c for c in self.morpheme
            if unicodedata.category(c) not in {'Po', 'Pf', 'Ps', 'Pd', 'Pe', 'Pi', 'Sm'})

    @property
    def first(self) -> bool:
        """Whether the morpheme is the first in the word."""
        return not bool(self.prev)

    @property
    def last(self) -> bool:
        """Whether the morpheme is the last in the word."""
        return not bool(self.next)

    @property
    def grammatical_concepts(self) -> list[str]:
        """
        Grammatical concepts, referenced with category labels according to Rule 3, used in
        morpheme gloss.

        .. note::

            Gloss element separators according to Rule 4B and 4C are interpreted as signaling
            a separate concept.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedMorpheme
            >>> gm = GlossedMorpheme(morpheme='abc', gloss='ABC.DEF:GHI;JKL', sep='.')
            >>> gm.grammatical_concepts
            ['ABC.DEF', 'GHI', 'JKL']
        """
        return list(self._glosses('grammatical'))

    @property
    def lexical_concepts(self) -> list[str]:
        """
        Gloss elements not recognized as category labels are interpreted as lexical concepts.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedMorpheme
            >>> gm = GlossedMorpheme(morpheme='çık', gloss='come_out', sep='.')
            >>> gm.lexical_concepts
            ['come out']
        """
        return list(self._glosses('lexical'))

    def _glosses(self, type_):
        """
        Yield the concepts of the requested kind, with "_" replaced by " ".

        :param type_: `'lexical'` selects the gloss elements which are not category labels, \
        `'grammatical'` the ones which are.
        """
        def selected(ge):
            return (type_ == 'lexical' and not ge.is_category_label) or \
                (type_ == 'grammatical' and ge.is_category_label)

        s = ''
        for ge in self.gloss.elements:
            if isinstance(ge, (GlossElementAfterColon, GlossElementAfterSemicolon)):
                # Something new is starting.
                if s:
                    yield s.replace('_', ' ')
                s = ''
                if selected(ge):
                    s = str(ge)
            elif selected(ge):
                if s:
                    # Category labels stay joined by their separator, words by a space.
                    s += ge.start if ge.is_category_label else ' '
                s += str(ge)
        if s:
            yield s.replace('_', ' ')
@dataclasses.dataclass
class GlossedWord:
    """
    A (word, gloss) pair, corresponding to two aligned items from IGT according to LGR.

    Provides list-like access to its :class:`GlossedMorpheme` s.

    :ivar word: The object language word.
    :ivar gloss: The word gloss.
    :ivar glossed_morphemes: The aligned morphemes; filled when the object is created.
    :ivar strict: Whether to raise `ValueError` if the morpheme separators of word and gloss \
    do not match.
    :ivar is_valid: `False` if a morpheme separator mismatch was detected (and `strict` is \
    `False`), `True` otherwise.
    """
    word: str
    gloss: str
    glossed_morphemes: list[GlossedMorpheme] = dataclasses.field(default_factory=list)
    strict: bool = False

    def __post_init__(self):
        # Make sure the attribute is available for well-formed words, too.
        self.is_valid = True
        mm, gg = split_morphemes(self.word), split_morphemes(self.gloss)
        if len(mm) != len(gg):
            if self.strict:
                raise ValueError(f'Morpheme separator mismatch: {self.word} :: {self.gloss}')
            self.is_valid = False
        sep, prev = None, None
        # Note: In case of a length mismatch only the common prefix of both lists is aligned.
        for m, g in zip(mm, gg):
            if not m and not g:
                continue  # Morpheme starts or ends with separator
            if m in MORPHEME_SEPARATORS:
                if m != g:
                    if self.strict:
                        raise ValueError(
                            f'Morpheme separator mismatch: {self.word} :: {self.gloss}')
                    self.is_valid = False
                    break
                sep = m
            else:
                assert m and g, (mm, g)
                gm = GlossedMorpheme(m, g, sep=sep)
                self.glossed_morphemes.append(gm)
                if prev:
                    # Link neighbouring morphemes in both directions.
                    prev.next = gm
                    gm.prev = prev
                prev = gm

    def __eq__(self, other):
        """Two glossed words are equal if their lists of glossed morphemes are."""
        try:
            other_morphemes = other.glossed_morphemes
        except AttributeError:
            # Not comparable (e.g. `None`): Let Python fall back to its default handling
            # rather than failing with an `AttributeError`.
            return NotImplemented
        return self.glossed_morphemes == other_morphemes

    def __repr__(self):
        return f'<{self.__class__.__name__} word={self.word} gloss={self.gloss}>'

    def __iter__(self):
        return iter(self.glossed_morphemes)

    def __getitem__(self, item):
        return self.glossed_morphemes[item]

    def __len__(self):
        return len(self.glossed_morphemes)

    @property
    def form(self) -> str:
        """
        Removes sentence-level markup and morpheme separators from `.word`.

        .. code-block:: python

            >>> from pyigt.lgrmorphemes import GlossedWord
            >>> gw = GlossedWord(word='"An-fangs', gloss="a-b")
            >>> gw.form
            'Anfangs'
        """
        return ''.join(gm.form for gm in self)

    @property
    def word_from_morphemes(self) -> str:
        """
        The word, re-assembled from the parsed morpheme forms and their separators.

        >>> gw = GlossedWord('a-word', 'a.DU-gloss')
        >>> gw.word_from_morphemes
        'a-word'
        """
        # The separator is only written between morphemes, not in front of the first one.
        return ''.join(itertools.chain.from_iterable(
            (gm.sep if gm.prev else '', str(gm.morpheme.elements)) for gm in self))

    @property
    def gloss_from_morphemes(self) -> str:
        """
        The word gloss, re-assembled from the parsed morpheme glosses and their separators.

        >>> gw = GlossedWord('a-word', 'a.DU-gloss')
        >>> gw.gloss_from_morphemes
        'a.DU-gloss'
        """
        # The separator is only written between morphemes, not in front of the first one.
        return ''.join(itertools.chain.from_iterable(
            (gm.sep if gm.prev else '', str(gm.gloss.elements)) for gm in self))