# Source code for pyigt.igt

"""
Functionality related to interlinear glossed text.
"""
import functools
import re
import enum
import types
from typing import Optional, Union, Literal, Callable, Any
import pathlib
import tempfile
import itertools
import collections
from collections.abc import Iterable, Generator
import dataclasses
import unicodedata

import segments
from csvw.dsv import UnicodeWriter, reader
from csvw.metadata import Link, TableGroup
from pycldf import Dataset
from pycldf import orm
import pycldf

try:
    import lingpy
except ImportError:  # pragma: no cover
    lingpy = False

from pyigt.util import expand_standard_abbr, align
from pyigt.lgrmorphemes import (
    GlossedWord, split_morphemes, remove_morpheme_separators, GlossedMorpheme
)

__all__ = ['IGT', 'Corpus', 'LGRConformance', 'Example', 'LingPySettings']

NON_OVERT_ELEMENT = '∅'
ConcordanceType = Literal["grammar", "lexicon", "form"]


def with_lingpy():  # pylint: disable=C0116
    """
    Return the `lingpy` module, or raise if the optional dependency is not installed.

    :raises ValueError: if importing `lingpy` failed when this module was loaded.
    """
    # `lingpy` is bound to `False` at import time when the package is missing.
    if lingpy:
        return lingpy
    raise ValueError('pyigt must be installed with lingpy support for this functionality! '
                     'Run `pip install pyigt[lingpy]`')


@enum.unique
class LGRConformance(enum.IntEnum):
    """
    Conformance levels with respect to alignment of phrase and gloss of an `IGT`.

    We distinguish the following levels:

    - morpheme-aligned (IGT conforms to LGR Rule 2)
    - word-aligned (IGT conforms to LGR Rule 1, but not Rule 2)
    - unaligned (IGT does not conform to LGR Rule 1)

    Being an `IntEnum`, levels can be compared: a higher value means stricter conformance.
    """
    MORPHEME_ALIGNED = 2
    WORD_ALIGNED = 1
    UNALIGNED = 0


def parse_phrase(p):
    """
    Split a phrase given as `str` into a list of words; anything else is returned unchanged.

    LGR Rule 2A is taken into account: a morpheme separated by " -" from the preceding
    material stays attached to the preceding word (including the space).
    """
    if not isinstance(p, str):
        return p
    placeholder = '|||'
    # Shield the blank in "<word> -" from the whitespace split below ...
    shielded = re.sub(r'([^\s]+) -', lambda match: match.group(1) + placeholder + '-', p)
    # ... and restore it once the words have been separated.
    return [chunk.replace(placeholder, ' ') for chunk in shielded.split()]


@dataclasses.dataclass
class IGT:  # pylint: disable=R0902
    """
    The main trait of IGT is the alignment of words and glosses. Thus, we are mostly interested
    in the two aligned "lines": the analyzed text and the glosses, rather than trying to support
    any number of tiers, and alignment based on timestamps or similar.
    Thus, an `IGT` instance is a `list` of aligned words, and each aligned word a `list` of aligned
    morphemes. This structure can be exploited to access parts of the alignment, see
    :meth:`IGT.__getitem__`

    :ivar phrase: `list` of `str` representing the gloss-aligned words of the IGT.
    :ivar gloss: `list` of `str` representing the word-aligned glosses of the IGT.
    :ivar id: Optional identifier, can be used for referencing the `IGT` if it is part of a \
    `Corpus`.
    :ivar properties: `typing.Dict[str, object]` storing additional properties of an `IGT`, e.g. \
    additional column values read from a row in a CLDF ExampleTable.
    :ivar language: Optional language identifier, specifying the object language of the `IGT`.
    :ivar translation: Optional translation of the phrase.
    :ivar abbrs: Optional `dict` providing descriptions of gloss labels used in the `IGT`.
    :ivar strict: `bool` flag signaling whether to parse the `IGT` in strict mode, i.e. requiring \
    matching morpheme separators in phrase and gloss, or not.

    .. note::

        **LGR Conformance**

        While the main purpose of an `IGT` is providing access to its words, morphemes and glosses,
        it also supports error/conformance checking. Thus, it is possible to initialize an `IGT`
        with "broken" data.

        .. code-block:: python

            >>> from pyigt import IGT
            >>> igt = IGT(phrase='two words', gloss='ONE.GLOSS')
            >>> igt.conformance
            <LGRConformance.UNALIGNED: 0>

        So before processing `IGT` instances, it should be checked whether the conformance level
        (see :class:`LGRConformance`) of the `IGT` is sufficient for the downstream requirements.
        Otherwise, accessing properties like :meth:`IGT.glossed_words` may lead to unexpected
        results:

        .. code-block:: python

            >>> igt.glossed_words  # we extract as many glossed words as possible ...
            [<GlossedWord word=a gloss=C>]
            >>> len(igt)
            1
            >>> len(igt.phrase)
            2
            >>> igt = IGT(phrase='multi-morph', gloss='GLOSS')
            >>> igt.conformance
            <LGRConformance.WORD_ALIGNED: 1>
            >>> igt[0].glossed_morphemes  # we extract as many glossed morphemes as possible ...
            [<GlossedMorpheme morpheme=multi gloss=GLOSS>]
    """
    phrase: list[str]
    gloss: list[str]
    id: Optional[str] = None
    properties: dict = dataclasses.field(default_factory=dict)
    language: str = None
    translation: str = None
    abbrs: dict = dataclasses.field(default_factory=dict)
    strict: bool = False

    def __post_init__(self):
        """
        Normalize the values passed to the constructor.

        - `phrase` and `gloss` given as `str` are split into lists of words.
        - A parenthesized list of abbreviations in the translation, e.g. ``(ABC = some label)``,
          is moved into `abbrs` and removed from the translation.
        - One enclosing quote character is stripped from either end of the translation.
        """
        self.phrase = parse_phrase(self.phrase)
        self.gloss = self.gloss.split() if isinstance(self.gloss, str) else self.gloss

        if self.translation:
            p = re.compile(r'\((?P<abbrs>((\s*,\s*)?[A-Z][A-Z0-9]*\s*=\s*[^,)]+)+)\)')
            abbrs = p.search(self.translation)
            if abbrs:
                for abbr in abbrs.group('abbrs').split(','):
                    abbr, _, label = abbr.partition('=')
                    self.abbrs[abbr.strip()] = label.strip()
                self.translation = p.sub('', self.translation).strip()
            # The translation may be empty by now (e.g. it only consisted of the abbreviation
            # list or of a single quote), so each index access must be guarded.
            if self.translation and (
                    self.translation[0] == "'"
                    or unicodedata.category(self.translation[0]) == 'Pi'):
                # Punctuation, Initial quote
                self.translation = self.translation[1:].strip()
            if self.translation and (
                    self.translation[-1] == "'"
                    or unicodedata.category(self.translation[-1]) == 'Pf'):
                # Punctuation, Final quote
                self.translation = self.translation[:-1].strip()

    def __len__(self):
        """Return the number of items in :attr:`glossed_words`."""
        return len(self.glossed_words)

    def __iter__(self):
        """Iterate over the items of :attr:`glossed_words`."""
        yield from self.glossed_words

    @property
    def glossed_words(self) -> list[GlossedWord]:  # pylint: disable=C0116
        """
        The words of the phrase paired with their glosses.

        Pairs are built with `zip`, so surplus items of the longer of `phrase` and `gloss`
        are ignored. The list is rebuilt on every access.
        """
        return [GlossedWord(w, g, strict=self.strict) for w, g in zip(self.phrase, self.gloss)]

    @property
    def prosodic_words(self) -> list[GlossedWord]:
        """
        The phrase interpreted prosodically, i.e.

        1. prosodically free elements marked with the " -" separator are split off and
        2. clitics stay conflated with their host.

        Use :meth:`IGT.as_prosodic` to get an `IGT` instance initialised from the prosodic words
        of an `IGT` instance.
        """
        prosodic = []
        for phrase_word, word_gloss in zip(self.phrase, self.gloss):
            form_buf, gloss_buf = '', ''
            pairs = zip(split_morphemes(phrase_word), split_morphemes(word_gloss))
            for form_part, gloss_part in pairs:
                # A "-" following a blank marks the boundary to a prosodically free element.
                at_boundary = form_part == '-' and form_buf.endswith(' ')
                if not at_boundary:
                    form_buf += form_part
                    gloss_buf += gloss_part
                    continue
                assert gloss_part == '-'
                prosodic.append(GlossedWord(form_buf.strip(), gloss_buf, strict=self.strict))
                form_buf, gloss_buf = '', ''
            if form_buf:
                prosodic.append(GlossedWord(form_buf, gloss_buf, strict=self.strict))
        return prosodic

    @property
    def morphosyntactic_words(self) -> list[GlossedWord]:
        """
        The phrase interpreted morphosyntactically, i.e.

        1. prosodically free elements marked with the " -" separator stay conflated and
        2. clitics (separated by "=") are split into separate words.

        Use :meth:`IGT.as_morphosyntactic` to get an `IGT` instance initialised from the
        morphosyntactic words of an `IGT` instance.
        """
        return [
            GlossedWord(form, gloss, strict=self.strict)
            for phrase_word, word_gloss in zip(self.phrase, self.gloss)
            for form, gloss in zip(phrase_word.split('='), word_gloss.split('='))]

[docs]    def as_prosodic(self) -> 'IGT':
        """
        .. code-block:: python

            >>> from pyigt import IGT
            >>> igt = IGT(phrase='a=bcd -e', gloss='A=BCD-E')
            >>> len(igt) != len(igt.as_prosodic())
            True
            >>> igt[0].word
            'a=bcd -e'
            >>> igt.as_prosodic()[0].word
            'a=bcd'
        """
        return IGT(
            phrase=[gw.word for gw in self.prosodic_words],
            gloss=[gw.gloss for gw in self.prosodic_words],
            id=self.id,
            properties=self.properties,
            language=self.language,
            translation=self.translation,
            abbrs=self.abbrs,
            strict=self.strict,
        )

[docs]    def as_morphosyntactic(self):
        """
        .. code-block:: python

            >>> from pyigt import IGT
            >>> igt = IGT(phrase='a=bcd -e', gloss='A=BCD-E')
            >>> len(igt) != len(igt.as_morphosyntactic())
            True
            >>> igt[0].word
            'a=bcd -e'
            >>> igt.as_morphosyntactic()[-1].word
            'bcd -e'
        """
        return IGT(
            phrase=[gw.word for gw in self.morphosyntactic_words],
            gloss=[gw.gloss for gw in self.morphosyntactic_words],
            id=self.id,
            properties=self.properties,
            language=self.language,
            translation=self.translation,
            abbrs=self.abbrs,
            strict=self.strict,
        )

    @property
    def gloss_abbrs(self) -> collections.OrderedDict:
        """
        The gloss abbreviations used in the IGT, mapped to their descriptions.

        Descriptions given in `abbrs` take precedence over the standard expansion; labels
        without a known description are mapped to `None`.
        """
        described = collections.OrderedDict()
        for glossed_word in self.glossed_words:
            for glossed_morpheme in glossed_word:
                for label in glossed_morpheme.gloss.elements:
                    # We disregard "I".
                    if not (label != 'I' and label.is_category_label):
                        continue
                    if label in self.abbrs:
                        described[label] = self.abbrs[label]
                        continue
                    expansion = expand_standard_abbr(label)
                    described[label] = expansion if expansion != label else None
        return described

    def __str__(self):
        """
        Render the IGT as plain text: primary text, aligned phrase and gloss and - if
        available - the translation in single quotes. Use a monospaced font to make the
        alignment visible.
        """
        aligned = align(self.gloss, self.phrase)
        text = f'{self.primary_text}\n{aligned}'
        if self.translation:
            text += f'\n‘{self.translation}’'
        return text

    def pprint(self):  # pylint: disable=C0116
        """Print the IGT followed by a legend of the gloss abbreviations with a description."""
        described = [(label, desc) for label, desc in self.gloss_abbrs.items() if desc]
        legend = ''
        if described:
            # Pad all labels to the same width to line up the "=" signs.
            width = max(len(label) for label, _ in described)
            legend = ''.join(f'\n  {label.ljust(width)} = {desc}' for label, desc in described)
        print(f"{self}{legend}")

[docs]    def __getitem__(
            self,
            i: Union[int, tuple[int, Union[int, slice]]],
    ) -> Union[list, GlossedWord, GlossedMorpheme]:
        """
        Provide access to `GlossedWord` or `GlossedMorpheme` (s) by zero-based index.

        :param i: An `int` index to reference a `GlossedWord` or a (`int`, `int`) tuple,\
        referencing a `GlossedMorpheme`.

        .. code-block:: python

            >>> from pyigt import IGT
            >>> igt = IGT(phrase="zəp-le: ȵi-ke: pe-ji qeʴlotʂu-ʁɑ,", gloss="a-DEF b-IN c-CSM d-LO")
            >>> igt[0].word
            'zəp-le:'
            >>> [gw.word for gw in igt[2:]]
            ['pe-ji', 'qeʴlotʂu-ʁɑ,']
            >>> str(igt[0, 0].morpheme)
            'zəp'
            >>> [str(gm.morpheme) for gm in igt[1, 0:]]  # All morphemes of the second word
            ['ȵi', 'ke:']
            >>> [str(gm.morpheme) for gm in igt[0:, 0]]  # First morpheme in each word
            ['zəp', 'ȵi', 'pe', 'qeʴlotʂu']
        """
        if isinstance(i, tuple):
            assert len(i) == 2
            word = self.glossed_words[i[0]]
            if isinstance(word, list):
                return [w[i[1]] for w in word]
            return word[i[1]]
        return self.glossed_words[i]

    @property
    def conformance(self) -> LGRConformance:
        """
        Alignment level of the `IGT`, i.e. the strictest level whose check passes.
        """
        # Try the levels from strictest to laxest.
        levels = (
            (LGRConformance.MORPHEME_ALIGNED, True),
            (LGRConformance.WORD_ALIGNED, False),
        )
        for level, strict in levels:
            if self.is_valid(strict=strict):
                return level
        return LGRConformance.UNALIGNED

    def is_valid(self, strict: bool = False) -> bool:  # pylint: disable=C0116
        """
        Boolean variant of :meth:`check`.

        :param strict: Passed on to :meth:`check`.
        :return: `False` if :meth:`check` raised `ValueError` or `AssertionError`, else `True`.
        """
        try:
            self.check(strict=strict)
        except (ValueError, AssertionError):
            return False
        return True

[docs]    def check(self, strict: bool = False, verbose: bool = False):
        """
        :param strict: If `True`, also check Rule 2: Morpheme-by-morpheme correspondence.
        """
        res = len(self.phrase) == len(self.gloss)
        if not res:
            if verbose:
                print('\t'.join(self.phrase))
                print('\t'.join(self.gloss))
            raise ValueError(
                'Rule 1 violated: Number of words does not match number of word glosses!')
        if strict:
            for i, (m, g) in enumerate(zip(self.phrase, self.gloss)):
                try:
                    GlossedWord(m, g, strict=True)
                except ValueError as e:
                    if verbose:
                        print(self.phrase[i])
                        print(self.gloss[i])
                    raise ValueError(
                        'Rule 2 violated: Number of morphemes does not match number of morpheme '
                        'glosses!') from e

    @property
    def phrase_text(self) -> str:  # pylint: disable=C0116
        """The words of the phrase joined by single blanks; falsy words count as empty."""
        return ' '.join([w or '' for w in self.phrase])

    @property
    def primary_text(self) -> str:
        """
        The primary text of the `IGT`, i.e. the phrase stripped of morpheme separators.

        For morpheme-aligned IGTs the text is assembled from the morphemes, leaving out
        non-overt elements; otherwise separators are removed from the phrase text.
        """
        if self.conformance != LGRConformance.MORPHEME_ALIGNED:
            return remove_morpheme_separators(self.phrase_text)
        return ' '.join(
            ''.join(gm.morpheme for gm in gw if gm.morpheme != NON_OVERT_ELEMENT)
            for gw in self.glossed_words)

    @property
    def gloss_text(self) -> str:  # pylint: disable=C0116
        """The word glosses joined by single blanks."""
        return ' '.join(self.gloss)


class Example(orm.Example):  # pylint: disable=R0903
    """
    A custom object class to use with
    `pycldf.orm <https://pycldf.readthedocs.io/en/latest/orm.html>`_

    This class overrides the `pycldf.orm.Example.igt` property to return an `IGT` instance rather
    than a text string.

    .. code-block:: python

        >>> from pyigt import Example
        >>> from pycldf import Dataset
        >>> ds = Dataset.from_metadata('tests/fixtures/lgr/cldf/Generic-metadata.json')
        >>> ex = ds.objects('ExampleTable', cls=Example)
        >>> ex['2'].igt.gloss[1]
        'they-OBL-GEN'
        >>> ex['2'].igt.gloss_abbrs["OBL"]
        'oblique'
    """
    @property
    def igt(self) -> IGT:
        """
        Create an IGT instance based on the data of the example row.

        The translation is passed in single quotes, followed by the comment in parentheses
        (if the row has one), so that `IGT` can pick up abbreviations listed in the comment.
        """
        # A missing translation must not end up as the literal text 'None'.
        tr = f"'{self.cldf.translatedText or ''}'"
        try:
            if self.cldf.comment:
                tr += f' ({self.cldf.comment})'
        except AttributeError:  # pragma: no cover
            # The row has no comment property.
            pass
        return IGT(
            id=self.id,
            gloss=self.cldf.gloss,
            phrase=self.cldf.analyzedWord,
            language=self.cldf.languageReference,
            translation=tr,
        )


def _clean_lexical_concept(s):
    s = re.sub(r'†\(([^)]+)\)', lambda m: m.groups()[0], s)
    return s.replace('†', '').strip()


@dataclasses.dataclass
class MorphemeContext:
    """A morpheme together with the word and the IGT it appears in."""
    igt: IGT
    word: GlossedWord
    morpheme: GlossedMorpheme

    @property
    def concepts(self) -> list[tuple[str, str]]:
        """
        The concepts of the morpheme as (concept, category) pairs: first the lexical concepts
        with category 'lexicon', then the grammatical concepts with category 'grammar'.
        """
        tagged = [(concept, 'lexicon') for concept in self.morpheme.lexical_concepts]
        tagged.extend((concept, 'grammar') for concept in self.morpheme.grammatical_concepts)
        return tagged


@dataclasses.dataclass(frozen=True)
class MorphemeReference:
    """A morpheme in a corpus, identified by the indices of IGT, word and morpheme."""
    igt_index: int
    word_index: int
    morpheme_index: int

    def __str__(self):
        """Colon-separated representation of the three indices."""
        parts = (self.igt_index, self.word_index, self.morpheme_index)
        return ':'.join(str(part) for part in parts)

    def resolve(self, corpus) -> MorphemeContext:
        """Look up IGT, word and morpheme in the corpus and bundle them in a context object."""
        igt = corpus[self.igt_index]
        word = corpus[self.igt_index, self.word_index]
        morpheme = corpus[self]
        return MorphemeContext(igt, word, morpheme)


@dataclasses.dataclass(frozen=True)
class Concordance:
    """Maps grammatical concepts, lexical concepts and forms to lists of occurrences."""
    grammar: dict[str, list[MorphemeReference]]
    lexicon: dict[str, list[MorphemeReference]]
    form: dict[str, list[MorphemeReference]]

    @classmethod
    def from_igts(cls, igts: collections.OrderedDict[Union[str, int], IGT]):
        """
        Create a concordance for the morphemes in IGT items.

        IGTs which are not morpheme-aligned and morphemes without a form are skipped.
        """
        by_grammar, by_lexicon, by_form = (collections.defaultdict(list) for _ in range(3))
        for key, igt in igts.items():
            if not igt.is_valid(strict=True):  # We ignore non-morpheme-aligned IGTs.
                continue
            for word_pos, glossed_word in enumerate(igt):
                for morpheme_pos, glossed_morpheme in enumerate(glossed_word):
                    if not glossed_morpheme.form:
                        continue

                    occurrence = MorphemeReference(key, word_pos, morpheme_pos)
                    for concept in glossed_morpheme.grammatical_concepts:
                        by_grammar[concept].append(occurrence)
                    # All lexical concepts of a morpheme make up one combined lexicon key.
                    lexicon_key = ' // '.join(glossed_morpheme.lexical_concepts)
                    by_lexicon[lexicon_key].append(occurrence)
                    by_form[glossed_morpheme.form].append(occurrence)
        return cls(by_grammar, by_lexicon, by_form)


@dataclasses.dataclass(frozen=True)
class LingPySettings:
    """Settings controlling how a LingPy wordlist is created from wordlist data."""
    ref: str = 'crossid'
    lexstat: bool = True
    threshold: float = 0.4

    def get_wordlist(self, d: collections.OrderedDict[int, list[Any]]):  # pylint: disable=C0116
        """
        Create a LingPy wordlist from `d` and add a column named `ref`.

        With `lexstat` set, a `LexStat` wordlist is clustered with method 'sca' and the
        configured threshold; otherwise a plain `Wordlist` gets a 'cog' column combining
        concept and form, which is then renumbered into `ref`.
        """
        lingpy_module = with_lingpy()
        if not self.lexstat:
            wl = lingpy_module.Wordlist(d)
            wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
            wl.renumber('cog', self.ref)
            return wl
        wl = lingpy_module.LexStat(d)
        wl.cluster(method='sca', threshold=self.threshold, ref=self.ref)
        return wl


class Corpus:
    """
    A Corpus is an immutable, ordered list of `IGT` instances.

    It provides access to concordance-like aggregated statistics of its texts.

    :ivar monolingual: Flag signaling whether the corpus is monolingual or contains `IGT` from \
    different object languages.
    """
    def __init__(self, igts: Iterable[IGT], fname=None, clean_lexical_concept=None):
        """
        :param igts: The `IGT` instances making up the corpus.
        :param fname: Stored as `fname` attribute.
        :param clean_lexical_concept: Optional callable to normalize concept labels, used \
        instead of the default implementation.
        """
        self.clean_lexical_concept = clean_lexical_concept or _clean_lexical_concept
        self.fname = fname
        # IGTs are keyed by their id, or by their position if the id is missing (or falsy).
        self._igts = collections.OrderedDict()
        for position, igt in enumerate(igts):
            self._igts[igt.id or position] = igt
        # Since changing the IGTs in the corpus is not allowed, we can compute concordances right
        # away.
        self._concordance = Concordance.from_igts(self._igts)
        languages = set(igt.language for igt in self._igts.values())
        self.monolingual = len(languages) == 1

    @property
    def grammar(self) -> dict[str, list[MorphemeReference]]:
        """
        Maps grammatical concepts to lists of occurrences (as `MorphemeReference` instances,
        which can be resolved by indexing the corpus).

        .. code-block:: python

            >>> from pyigt import Corpus, IGT
            >>> igt = IGT(phrase="ni-c-chihui-lia in no-piltzin ce calli",
            ...           gloss="1SG.SUBJ-3SG.OBJ-mach-APPL DET 1SG.POSS-Sohn ein Haus")
            >>> c = Corpus([igt])
            >>> [[c[ref] for ref in c.grammar[k]] for k in c.grammar if k.startswith('1SG')]
            [[<GlossedMorpheme morpheme=ni gloss=1SG.SUBJ>],
             [<GlossedMorpheme morpheme=no gloss=1SG.POSS>]]
        """
        return self._concordance.grammar

    @property
    def lexicon(self) -> dict[str, list[MorphemeReference]]:
        """
        Maps lexical concepts to lists of occurrences (as `MorphemeReference` instances,
        which can be resolved by indexing the corpus).

        .. code-block:: python

            >>> from pyigt import Corpus, IGT
            >>> igt = IGT(phrase="ni-c-chihui-lia in no-piltzin ce calli",
            ...           gloss="1SG.SUBJ-3SG.OBJ-mach-APPL DET 1SG.POSS-Sohn ein Haus")
            >>> c = Corpus([igt])
            >>> [c[ref] for ref in c.lexicon['Sohn']]
            [<GlossedMorpheme morpheme=piltzin gloss=Sohn>]
        """
        return self._concordance.lexicon

    @property
    def form(self) -> dict[str, list[MorphemeReference]]:
        """
        Maps morpheme forms to lists of occurrences.

        .. code-block:: python

            >>> from pyigt import Corpus, IGT
            >>> igt = IGT(phrase="ni-c-chihui-lia in no-piltzin ce calli",
            ...           gloss="1SG.SUBJ-3SG.OBJ-mach-APPL DET 1SG.POSS-Sohn ein Haus")
            >>> c = Corpus([igt])
            >>> [k for k in c.form]
            ['ni', 'c', 'chihui', 'lia', 'in', 'no', 'piltzin', 'ce', 'calli']
        """
        return self._concordance.form

    @staticmethod
    def get_column_names(cldf: Dataset) -> types.SimpleNamespace:  # pylint: disable=C0116
        """
        Determine the local names of the ExampleTable columns relevant for IGT.

        :return: Namespace with attributes `id`, `phrase`, `gloss`, `translation` and \
        `language`; an attribute is `None` if the dataset lacks the corresponding column.
        """
        # We lookup local column names by ontology term:
        term_by_attribute = {
            'id': 'id',
            'phrase': 'analyzedWord',
            'gloss': 'gloss',
            'translation': 'translatedText',
            'language': 'languageReference',
        }
        names = {}
        for attribute, term in term_by_attribute.items():
            spec = ('ExampleTable', term)
            names[attribute] = cldf[spec].name if spec in cldf else None
        return types.SimpleNamespace(**names)

[docs]    @classmethod
    def from_cldf(cls, cldf: Dataset) -> 'Corpus':
        """
        Instantiate a corpus of IGT examples from a CLDF dataset.

        :param cldf: a `pycldf.Dataset` instance.
        :param spec: a `CorpusSpec` instance, specifying how to interpret markup in the corpus.
        """
        def fix_tab(s):
            if s and '\\t' in s[0]:
                return s[0].split('\\t')
            return s

        cols = cls.get_column_names(cldf)
        igts = [
            IGT(
                id=igt[cols.id],
                gloss=fix_tab(igt[cols.gloss]),
                phrase=fix_tab(igt[cols.phrase]),
                language=igt.get(cols.language),
                translation=igt.get(cols.translation),
                properties=igt,
            )
            for igt in cldf['ExampleTable']]
        d = cldf.tablegroup._fname.parent  # pylint: disable=W0212
        return cls(igts, fname=d / str(cldf['ExampleTable'].url))

    @classmethod
    def from_stream(cls, stream) -> 'Corpus':  # pylint: disable=C0116
        """
        Instantiate a corpus from CSV data read from a file-like object.

        Columns are looked up by the default ExampleTable column names; words in the phrase
        and gloss columns are expected to be separated by a literal backslash followed by "t".
        """
        # A throw-away dataset, only needed to determine the default column names.
        cldf = Dataset(TableGroup())
        cldf.add_component('ExampleTable')
        cols = cls.get_column_names(cldf)

        def as_igt(row):
            return IGT(
                id=row[cols.id],
                gloss=row[cols.gloss].split('\\t'),
                phrase=row[cols.phrase].split('\\t'),
                language=row.get(cols.language),
                properties=row,
            )

        rows = reader(stream.read().splitlines(), dicts=True)
        return cls([as_igt(row) for row in rows])

[docs]    @classmethod
    def from_path(cls, path: Union[str, pathlib.Path]) -> 'Corpus':
        """
        Instantiate a corpus from a file path.

        :param path: Either a path to a CLDF dataset's metadata file or to a CLDF Examples \
        component as CSV file. Note that in the latter case, the file must use the default \
        column names, as defined in the CLDF ontology.
        """
        if isinstance(path, str):
            path = pathlib.Path(path)
        if path.suffix == '.json':
            return cls.from_cldf(Dataset.from_metadata(path))
        # We are given only an ExampleTable. Let's create the appropriate dataset:
        header = None
        for d in reader(path, dicts=True):
            header = list(d.keys())
            break
        ds = Dataset.from_metadata(
            pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json')
        ds.tablegroup._fname = path.parent / 'cldf-metadata.json'  # pylint: disable=W0212
        t = ds.add_component('ExampleTable')
        t.url = Link(path.name)
        default_cols = [col.name for col in t.tableSchema.columns]
        ds.remove_columns(t, *list(set(default_cols) - set(header)))
        ds.add_columns(t, *list(set(header) - set(default_cols)))
        return cls.from_cldf(ds)

    def __len__(self):
        """Return the number of IGTs in the corpus."""
        return len(self._igts)

    def __iter__(self):
        """Iterate over the IGTs of the corpus in insertion order."""
        return iter(self._igts.values())

    def __getitem__(self, item) -> Union[IGT, GlossedWord, GlossedMorpheme]:
        """
        Access an IGT, a glossed word or a glossed morpheme.

        :param item: One of

            - an IGT key (its id, or its position if it has no id) or a positional index,
            - a pair (IGT key or position, word index),
            - a triple (IGT key or position, word index, morpheme index) or the equivalent \
              `MorphemeReference`.
        """
        if isinstance(item, MorphemeReference):
            item = dataclasses.astuple(item)
        if not isinstance(item, tuple):
            # Keys take precedence; anything else is interpreted as position.
            return self._igts[item] if item in self._igts else list(self._igts.values())[item]
        # Resolve the IGT the same way for pairs and triples, so that positional access
        # works for both.
        igt = self[item[0]]
        if len(item) == 2:
            return igt[item[1]]
        return igt[tuple(item[1:])]

    def get_stats(self) -> tuple[int, int, int]:  # pylint: disable=C0116
        """Return the numbers of IGTs, of glossed words and of glossed morphemes."""
        igt_count = len(self)
        word_count = sum(len(igt) for igt in self)
        morpheme_count = sum(len(word) for igt in self for word in igt)
        return igt_count, word_count, morpheme_count

    def get_lgr_conformance_stats(self) -> dict[LGRConformance, int]:  # pylint: disable=C0116
        """Count the IGTs of the corpus per conformance level."""
        return collections.Counter([igt.conformance for igt in self])

    def _write_csv(self, filename, rows):
        """Write rows as tab-separated values to `filename`, or print them if it is falsy."""
        with UnicodeWriter(filename, delimiter='\t') as w:
            w.writerows(rows)

        # Without a file name, the content is read back from the writer and printed.
        if not filename:
            print(w.read().decode('utf8'))

[docs]    def write_concordance(
            self,
            ctype: ConcordanceType,
            filename: Optional[Union[str, pathlib.Path]] = None,
    ):
        """
        :param ctype: `lexicon` or `grammar` or `form`.
        """
        conc = collections.defaultdict(list)
        for c, refs in getattr(self, ctype).items():
            for ref in refs:
                # We want one row per unique (form, language, concept, gloss).
                if ctype == 'form':
                    key = [c, str(self[ref].gloss), str(self[ref].gloss)]
                else:
                    key = [self[ref].form, self.clean_lexical_concept(c), c]
                conc[tuple(key + [self[ref.igt_index].language])].append(ref)

        rows = []
        h = ['ID', 'FORM', 'GLOSS', 'GLOSS_IN_SOURCE', 'OCCURRENCE', 'REF']
        if not self.monolingual:
            h.insert(1, 'LANGUAGE_ID')
        rows.append(h)
        # We order the rows by descending frequency:
        for i, (k, refs) in enumerate(
                sorted(conc.items(), key=lambda x: (-len(x[1]), x[0])), start=1):
            c = [i, k[0], k[1], k[2], len(refs), ' '.join(str(ref) for ref in refs)]
            if not self.monolingual:
                c.insert(1, k[3])
            rows.append(c)

        self._write_csv(filename, rows)

[docs]    def write_concepts(
            self,
            ctype: ConcordanceType,
            filename: Optional[Union[str, pathlib.Path]] = None,
    ):
        """
        :param ctype: `lexicon` or `grammar`.
        """
        def form(ref):
            if self.monolingual:
                return self[ref].form
            return f'{self[ref.igt_index].language}: {self[ref].form}'

        conc = []
        for c, refs in getattr(self, ctype).items():
            if c:
                igt = self[refs[0].igt_index]
                conc.append([
                    self.clean_lexical_concept(c),
                    len(refs),
                    ' // '.join(sorted(set(str(self[ref].gloss) for ref in refs))),
                    ' // '.join(sorted(set(form(ref) for ref in refs))),
                    igt.phrase_text,
                    igt.gloss_text,
                ])

        rows = [['ID', 'ENGLISH', 'OCCURRENCE', 'CONCEPT_IN_SOURCE', 'FORMS', 'PHRASE', 'GLOSS']]
        for i, row in enumerate(sorted(conc, key=lambda x: -x[1]), start=1):
            rows.append([i] + row)
        self._write_csv(filename, rows)

[docs]    def check_glosses(self, level=2):
        """Check alignment of glosses on word and morpheme level."""
        count = 1
        for idx, igt in self._igts.items():
            if not igt.is_valid() and level >= 1:
                print(f'[{idx} : first level {count}]')
                print(igt.phrase)
                print(igt.gloss)
                print('---')
                count += 1
            if level >= 2:
                for i, (w, m) in enumerate(zip(igt.phrase, igt.gloss), start=1):
                    try:
                        GlossedWord(w, m, strict=True)
                    except ValueError:
                        print(f'[{idx}:{i} : second level {count}]')
                        print(w)
                        print(m)
                        print('---')
                        count += 1

    def _iter_wordlist_items(
            self,
            doculect,
            tokenize: Callable[[str], list[str]],
    ) -> Generator[tuple[int, list[Any]], None, None]:
        """
        Yield (row id, row) pairs suitable for initializing a LingPy wordlist.

        The first pair (with id 0) is the header row. Concepts which are empty or whose
        tokens cannot be converted to sound classes are reported on stdout and skipped.

        :param doculect: Doculect name to use for all rows if the corpus is monolingual.
        :param tokenize: Callable to segment a form into tokens.
        """
        yield 0, [
            'doculect',
            'concept',
            'concept_in_source',
            'concept_type',
            'form',
            'tokens',
            'occurrences',
            'word_forms',
            'gloss_forms',
            'phrase_example',
            'gloss_example',
            'references',
        ]

        def group_key(ref):
            return self[ref.igt_index].language, str(self[ref].gloss)

        idx = 1
        # Iterate over unique (cleaned concept, form, language, gloss) tuples.
        for form, refs in self.form.items():
            # `groupby` requires its input to be sorted by the grouping key.
            for (lid, gloss), morphrefs in itertools.groupby(
                    sorted(refs, key=group_key), group_key):
                morphrefs = list(morphrefs)
                # The first occurrence serves as example for the whole group.
                morphctx = morphrefs[0].resolve(self)
                for concept, ctype in morphctx.concepts:
                    concept = self.clean_lexical_concept(concept)
                    tokens = tokenize(form)
                    # check tokens
                    try:
                        with_lingpy().tokens2class(tokens, 'sca')
                        check = True
                    except Exception:  # pragma: no cover  # pylint: disable=W0718
                        check = False
                    if concept.strip() and check:
                        yield idx, [
                            doculect if self.monolingual else lid,
                            concept,
                            gloss,
                            ctype,
                            form,
                            tokens,
                            len(morphrefs),
                            ' '.join(m.form for m in morphctx.word),
                            ' '.join(m.gloss for m in morphctx.word),
                            morphctx.igt.phrase_text,
                            morphctx.igt.gloss_text,
                            ' '.join(str(ref) for ref in morphrefs)]
                        idx += 1
                    else:
                        print(
                            f'[!] Problem with "{concept}" / [{form}] [{tokens}] / {morphrefs[0]}')

[docs]    def get_wordlist(
            self,
            doculect: str = 'base',
            profile: Optional[Union[str, pathlib.Path, segments.Profile]] = None,
            lingpy_settings: LingPySettings = LingPySettings(),
    ):
        """
        Return a classical wordlist from the data.
        """
        if profile:
            profile = segments.Tokenizer(profile)

        def tokenize(profile, x):
            if profile:
                return profile('^' + x + '$', column='IPA').split()  # noqa: E731
            return with_lingpy().ipa2tokens(x)

        d = collections.OrderedDict(
            self._iter_wordlist_items(doculect, functools.partial(tokenize, profile)))
        return lingpy_settings.get_wordlist(d)

[docs]    def get_profile(self, clts=None, filename=None) -> segments.Profile:
        """
        Compute an orthography profile with LingPy's function.

        :param filename: Write the computed profile to a file in addition to returning it.
        :return: `segments.Profile` instance.
        """
        clts = clts.bipa if clts else None

        D = {0: ['doculect', 'concept', 'ipa']}  # pylint: disable=C0103
        for i, key in enumerate(self.form, start=1):
            D[i] = ['dummy', str(self[self.form[key][0]].gloss), key]
        wordlist = with_lingpy().basic.wordlist.Wordlist(D)

        if not filename:
            with tempfile.NamedTemporaryFile(delete=FileExistsError) as fp:
                pass
            p = pathlib.Path(fp.name)
        else:
            p = pathlib.Path(filename)

        with UnicodeWriter(p, delimiter='\t') as w:
            w.writerow(['Grapheme', 'IPA', 'Example', 'Count', 'Unicode'])
            for line in with_lingpy().sequence.profile.context_profile(
                    wordlist, ref='ipa', clts=clts):
                w.writerow([line[0], line[1], line[2], line[4], line[5]])

        res = segments.Profile.from_file(p)
        if not filename:
            p.unlink()
        return res