Source code for ged4py.model

# -*- coding: utf-8 -*-

"""Module containing Python in-memory model for GEDCOM data.
"""

# from __future__ import annotations

__all__ = ['make_record', 'Record', 'Pointer', 'NameRec', 'Name',
           'Date', 'Individual']

import enum
from typing import Any, Iterator, List, Optional, Union

from .detail.name import (split_name, parse_name_altree, parse_name_ancestris,
                          parse_name_myher)
from .date import DateValue


@enum.unique
class Dialect(enum.Enum):
    """Known GEDCOM dialects.

    Even though the structure of a GEDCOM file is more or less fixed, the
    interpretation of some data may vary depending on which application
    produced the file. The constants below identify the dialects that are
    handled by the classes in this module. Each member value is a short
    `str` code, and ``enum.unique`` guarantees that no two members share a
    value.
    """

    DEFAULT = "DEF"
    """Constant used for default dialect (`str`)."""

    MYHERITAGE = "MYHER"  # myheritage.com
    """Constant used for myheritage.com dialect (`str`)."""

    ALTREE = "AGELONG"  # Agelong Tree (genery.com)
    """Constant used for genery.com dialect (`str`)."""

    ANCESTRIS = "ANCESTRIS"  # Ancestris (ancestris.org)
    """Constant used for ancestris.org dialect (`str`)."""


@enum.unique
class NameOrder(enum.Enum):
    """Ordering options for names/individuals.

    Names can be ordered differently, e.g. by surname first, by given name
    first, or by maiden name first. These constants define the available
    ordering options; they are the values accepted by `Name.order`.
    """
    SURNAME_GIVEN = "last+first"
    """Order by surname first, given name second."""

    GIVEN_SURNAME = "first+last"
    """Order by given name first, surname second."""

    MAIDEN_GIVEN = "maiden+first"
    """Order by maiden name (or surname) first, given name second."""

    GIVEN_MAIDEN = "first+maiden"
    """Order by given name first, maiden name (or surname) second."""


class Record:
    """Class representing a parsed GEDCOM record in a generic format.

    This is the main element of the data model, it represents records in
    GEDCOM files. Each GEDCOM record consists of a small number of items:

    - level number, integer;
    - optional reference ID, string in format ``@identifier@``;
    - tag name, short string;
    - optional record value, arbitrary string, for pointer records the
      record value is the reference ID of some other record.

    For many record types GEDCOM specifies subordinate (nested) records
    with incremental level number. Record class defines an interface that
    makes it easier to navigate this complex hierarchy of subordinate and
    referenced records:

    - ``sub_records`` attribute contains the list of all immediate
      subordinate records of this record.
    - `sub_tag` method finds a subordinate record given its tag, it can do
      it recursively if tag name contains multiple levels separated by
      slashes, and it can navigate through the pointer records
      transparently if ``follow`` argument is ``True``.
    - `sub_tag_value` is a convenience method that finds a subordinate
      record (via `~Record.sub_tag` call) but returns value of the record
      instead of record itself. This simplifies handling of missing tags.
    - `sub_tags` returns the list of subordinate records matching one or
      more tag paths. It is useful when multiple sub-records with the same
      tag can exist.

    There are a few sub-classes of the ``Record`` class providing
    additional methods or facilities for specific tag types.

    In general it is impossible to define what constitutes value or
    identity of GEDCOM record, so comparison of the records does not make
    sense. Similarly hashing operation cannot be used on Record instances,
    and the class is explicitly marked as non-hashable.

    Client code usually does not need to create instances of this class
    directly, `make_record()` should be used instead. If you create an
    instance of this class (or its subclass) then you are responsible for
    filling its attributes.

    Attributes
    ----------
    level : `int`
        Record level number
    xref_id : `str`
        Record reference ID, possibly empty.
    tag : `str`
        Tag name
    value : `object`
        Record value, possibly ``None``, for many record types value is a
        string or ``None``, some subclasses can define different type of
        record value.
    sub_records : `list` [ `Record` ]
        List of subordinate records, possibly empty.
    offset : `int`
        Record location in a file.
    dialect: `Dialect`
        GEDCOM source dialect, one of the `Dialect` enums.
    """

    def __init__(self):
        # All attributes start out as None; whoever creates the instance
        # (normally `make_record`) is responsible for filling them in.
        self.level = None
        self.xref_id = None
        self.tag = None
        self.value = None
        self.sub_records = None
        self.offset = None
        self.dialect = None

    def freeze(self) -> 'Record':
        """Method called by parser when updates to this record finish.

        Some sub-classes will override this method to implement conversion
        of record data to different representation.

        Returns
        -------
        self : `Record`
            Finalized record instance.
        """
        return self

    def sub_tag(self, path, follow=True) -> Optional['Record']:
        """Finds and returns sub-record with given tag name.

        Path can be a simple tag name, in which case the first direct
        sub-record of this record with the matching tag is returned. Path
        can also consist of several tags separated by slashes, in that case
        sub-records are searched recursively.

        If ``follow`` is True then pointer records are resolved and pointed
        record is used instead of pointer record, this also works for all
        intermediate records in a path. Pointers that cannot be resolved
        are skipped.

        Parameters
        ----------
        path : `str`
            One or more tag names separated by slashes.
        follow : `bool`
            If True then resolve pointers.

        Returns
        -------
        record : `Record`
            Subordinate record or ``None`` if sub-record with a given
            tag does not exist.
        """
        if not self.sub_records:
            return None

        head, _, tail = path.partition('/')
        for rec in self.sub_records:
            if rec.tag != head:
                continue
            # dereference pointers if needed
            if follow and isinstance(rec, Pointer):
                rec = rec.ref
            if rec is None:
                # dangling pointer, keep looking at remaining sub-records
                continue
            if not tail:
                return rec
            # recurse into the remaining path components
            found = rec.sub_tag(tail, follow=follow)
            if found is not None:
                return found
        return None

    def sub_tag_value(self, path, follow=True) -> Any:
        """Returns value of a direct sub-record.

        Works as `sub_tag()` but returns value of a sub-record instead of
        sub-record itself.

        Parameters
        ----------
        path : `str`
            One or more tag names separated by slashes.
        follow : `bool`
            If True then resolve pointers.

        Returns
        -------
        value : `object`
            Subordinate record value or `None` if sub-record with a given
            tag does not exist.
        """
        rec = self.sub_tag(path, follow)
        if rec is None:
            return None
        return rec.value

    def sub_tags(self, *tags: str, follow: bool = True) -> List['Record']:
        """Returns a list of sub-records matching any tag name.

        If no positional arguments are provided then all direct sub-records
        of this record are returned, pointers are resolved if ``follow`` is
        True.

        If one or more positional arguments are given then this method
        returns all sub-records, direct or nested, that match any of the
        given tags. Records are reported in the order in which they are
        stored, a record that matches one path is reported before nested
        records that match a longer path (e.g. ``"NAME"`` and
        ``"NAME/TYPE"`` can be requested together). Each record is reported
        at most once per location in the hierarchy.

        If ``follow`` is True then pointer records are resolved and pointed
        record is used instead of pointer record, this also works for all
        intermediate records in a path. Pointers that cannot be resolved
        are skipped, so the returned list never contains ``None``.

        A record whose ``sub_records`` is ``None`` is treated as having no
        sub-records.

        Parameters
        ----------
        *tags : `str`
            Each positional argument is one or more tag names separated by
            slashes. Empty strings are ignored.
        follow : `bool`, optional
            If True then resolve pointers.

        Returns
        -------
        records : `list` [ `Record` ]
            List of records, possibly empty.
        """

        def _resolve(rec: 'Record') -> Optional['Record']:
            # Replace a pointer with the record it refers to (may be None
            # for a dangling pointer); other records are returned as is.
            if follow and isinstance(rec, Pointer):
                return rec.ref
            return rec

        def _walk(record: 'Record', patterns: List[List[str]],
                  prefix: List[str]) -> Iterator['Record']:
            # Depth-first search for records whose tag path equals one of
            # the patterns, descending only where some pattern goes deeper.
            depth = len(prefix) + 1
            for rec in record.sub_records or ():
                path = prefix + [rec.tag]
                exact = any(pattern == path for pattern in patterns)
                deeper = any(len(pattern) > depth and pattern[:depth] == path
                             for pattern in patterns)
                if not (exact or deeper):
                    continue
                target = _resolve(rec)
                if target is None:
                    # dangling pointer, nothing to report or to descend into
                    continue
                if exact:
                    yield target
                if deeper:
                    yield from _walk(target, patterns, path)

        if not tags:
            # return all direct sub-records
            resolved = (_resolve(rec) for rec in self.sub_records or ())
            return [rec for rec in resolved if rec is not None]

        # this ignores empty tags
        patterns = [tag.split("/") for tag in tags if tag]
        return list(_walk(self, patterns, []))

    def __repr__(self) -> str:
        return self.__str__()

    def __str__(self) -> str:
        value = self.value
        # keep the representation short for long string values
        if isinstance(value, str) and len(value) > 32:
            value = value[:32] + "..."
        n_sub = 0 if self.sub_records is None else len(self.sub_records)
        if self.xref_id:
            fmt = "{0}(level={1.level}, xref_id={1.xref_id}, tag={1.tag}, " \
                "value={2!r}, offset={1.offset}, #subrec={3})"
        else:
            fmt = "{0}(level={1.level}, tag={1.tag}, " \
                "value={2!r}, offset={1.offset}, #subrec={3})"
        return fmt.format(self.__class__.__name__, self, value, n_sub)

    # Records cannot be hashed
    __hash__ = None  # type: ignore
class Pointer(Record):
    """Sub-class of `Record` representing a pointer to a record in a GEDCOM
    file.

    This class wraps a GEDCOM pointer value and adds a ``ref`` property
    which retrieves the pointed object. Instances of this class are used in
    place of the GEDCOM pointers in the objects created by parser.

    Parameters
    ----------
    parser : `ged4py.parser.GedcomReader`
        Instance of parser class.

    Attributes
    ----------
    value : `str`
        Value of the GEDCOM pointer (e.g. "@I1234@")
    ref : `Record`
        Referenced GEDCOM record.
    """

    def __init__(self, parser):
        super().__init__()
        self.parser = parser
        # Cache for the referenced record. ``None`` is a legitimate cached
        # result (unknown reference), so an empty list marks "not looked up
        # yet".
        self._value: Any = []

    @property
    def ref(self):
        """Referenced record, or ``None`` if the parser does not know the
        reference ID stored in ``value``.

        The record is read through the parser on first access and the
        result (including ``None``) is cached for later accesses.
        """
        cached = self._value
        if cached == []:
            # parser.xref0 maps reference ID to a pair whose first item is
            # the record offset
            location, _unused = self.parser.xref0.get(self.value, (None, None))
            cached = None if location is None else self.parser.read_record(location)
            self._value = cached
        return cached
class NameRec(Record):
    """Sub-class of `Record` representing the NAME record.

    This class adds an additional property for determining the type of the
    name. It also redefines the type of the `value` attribute: after
    `freeze()` it is a tuple.

    Value tuple can contain 3 or 4 elements, if there are 4 elements then
    the last element is a maiden name. Second element of a tuple is
    surname, first and third elements are pieces of the given name (this is
    determined entirely by how name is represented in GEDCOM file). Any of
    the elements can be an empty string. If NAME record value is empty in
    GEDCOM file then all three fields of the tuple will be empty strings.
    Few examples::

        ("John", "Smith", "")
        ("Mary Joan", "Smith", "", "Ivanova")  # maiden name
        ("", "Ivanov", "Ivan Ivanovich")
        ("John", "Smith", "Jr.")
        ("", "", "")                           # empty NAME record

    Client code usually does not need to create instances of this class
    directly, `make_record()` should be used instead.
    """

    def __init__(self):
        super().__init__()

    def freeze(self):
        """Method called by parser when updates to this record finish.

        Replaces the raw ``value`` with the name tuple produced by the
        dialect-specific name parser (or by `split_name` for any other
        dialect).

        Returns
        -------
        self : `NameRec`
            Finalized record instance.
        """
        # None is the same as empty string
        if self.value is None:
            self.value = ""

        dialect = self.dialect
        if dialect == Dialect.ALTREE:
            parts = parse_name_altree(self)
        elif dialect == Dialect.MYHERITAGE:
            parts = parse_name_myher(self)
        elif dialect == Dialect.ANCESTRIS:
            parts = parse_name_ancestris(self)
        else:
            # default parser works on the value string, not on the record
            parts = split_name(self.value)

        self.value = parts
        return self

    @property
    def type(self):
        """Name type as defined in TYPE record.

        ``None`` if TYPE record is missing, otherwise string, e.g. "aka",
        "birth", "immigrant", "maiden", "married" (or anything else).
        """
        # +1 TYPE <NAME_TYPE> {0:1}
        type_rec = self.sub_tag("TYPE")
        if type_rec:
            return type_rec.value
        return None

    def __str__(self):
        return super().__str__()
class Name:
    """Class representing "summary" of person names.

    Parameters
    ----------
    names : `list` [ `NameRec` ]
        List of NAME records (`NameRec` instances).
    dialect : `Dialect`
        One of `Dialect` enums.

    Notes
    -----
    Person in GEDCOM can have multiple NAME records, e.g. "aka" name,
    "maiden" name, etc. This class provides simple interface for selecting
    "best" name from all existing names. The algorithm for choosing best
    options is:

    - If there are no NAME records then it makes an empty name (with all
      empty components)
    - If there is only one NAME record then it is used for person name.
    - If there are multiple NAME records then the first record without TYPE
      sub-record is used, or if all records have TYPE sub-records then
      first NAME record is used.
    """

    def __init__(self, names, dialect):
        self._names = names
        self._dialect = dialect
        self._primary: Record  # "primary" name record

        count = len(names)
        if count == 0:
            # make fake name record to simplify logic below
            primary = make_record(0, '', "NAME", "", [], 0,
                                  Dialect.DEFAULT).freeze()
        elif count == 1:
            primary = names[0]
        else:
            # first record without a type wins, otherwise the first record
            primary = next((rec for rec in names if not rec.type), names[0])
        self._primary = primary

    @property
    def surname(self):
        """Person surname (`str`)"""
        parts = self._primary.value
        assert parts is not None
        return parts[1]

    @property
    def given(self):
        """Given name could include both first and middle name (`str`)"""
        parts = self._primary.value
        assert parts is not None
        leading, trailing = parts[0], parts[2]
        if leading and trailing:
            return leading + ' ' + trailing
        return leading or trailing

    @property
    def first(self):
        """First name is the first part of a given name (drops middle name)"""
        given = self.given
        return given.split()[0] if given else given

    @property
    def maiden(self):
        """Maiden last name, can be ``None`` (`str`)"""
        if self._dialect == Dialect.DEFAULT:
            # for default/unknown dialect try "maiden" name record first
            for rec in self._names:
                if rec.type == "maiden":
                    return rec.value[1]
        # rely on NameRec extracting it from other source
        primary = self._primary
        if primary and len(primary.value) > 3:  # type: ignore
            return primary.value[3]  # type: ignore
        return None

    def order(self, order):
        """Return name order key.

        Returns tuple with two strings that can be compared to other such
        tuple obtained from different name. Note that if you want
        locale-dependent ordering then you need to compare strings using
        locale-aware method (e.g. ``locale.strxfrm``).

        Parameters
        ----------
        order : `NameOrder`
            One of the `NameOrder` enums.

        Returns
        -------
        order : `tuple` [ `str` ]
            Tuple of two strings.

        Raises
        ------
        ValueError
            Raised if ``order`` is not one of the `NameOrder` enums.
        """
        given_key = self.given
        family_key = self.surname
        if order in (NameOrder.MAIDEN_GIVEN, NameOrder.GIVEN_MAIDEN):
            family_key = self.maiden or self.surname

        # We are collating empty names to come after non-empty,
        # so instead of empty we return "2" and add "1" as prefix to others
        given_key = "1" + given_key if given_key else "2"
        family_key = "1" + family_key if family_key else "2"

        if order in (NameOrder.SURNAME_GIVEN, NameOrder.MAIDEN_GIVEN):
            return (family_key, given_key)
        if order in (NameOrder.GIVEN_SURNAME, NameOrder.GIVEN_MAIDEN):
            return (given_key, family_key)
        raise ValueError("unexpected order: {}".format(order))

    def format(self):
        """Format name for output.

        There is no single correct way to represent name, values returned
        from this method are only useful in limited context, e.g. for
        logging.

        Returns
        -------
        name : `str`
            Formatted name representation.
        """
        parts = self._primary.value
        # non-empty pieces in order: leading given name, surname, trailing
        # given name
        pieces = (parts[0], self.surname, parts[2])  # type: ignore
        return ' '.join(piece for piece in pieces if piece)

    def __str__(self):
        return f"{self.__class__.__name__}({self.format()!r})"
class Date(Record):
    """Sub-class of `Record` representing the DATE record.

    After `freeze()` method is called by parser the `value` attribute
    contains the result of `ged4py.date.DateValue.parse` applied to the
    original record value.
    """

    def __init__(self):
        super().__init__()

    def freeze(self):
        """Method called by parser when updates to this record finish.

        Returns
        -------
        self : `Date`
            Finalized record instance.
        """
        parsed = DateValue.parse(self.value)
        self.value = parsed
        return self
class Individual(Record):
    """Sub-class of `Record` representing the INDI record.

    INDI record represents a single person in GEDCOM. This class defines a
    few methods that are useful shortcuts for accessing person information,
    such as navigation to parent records, name, etc.

    Client code usually does not need to create instances of this class
    directly, `make_record()` should be used instead.
    """

    def __init__(self):
        Record.__init__(self)
        # Cached parent records. ``None`` is a valid cached result (no such
        # parent), so an empty list marks "not looked up yet".
        self._mother: Optional[Union[Record, List]] = []
        self._father: Optional[Union[Record, List]] = []

    @property
    def name(self):
        """Person name (`Name`), built from all NAME sub-records."""
        # +1 <<PERSONAL_NAME_STRUCTURE>> {0:M}
        return Name(self.sub_tags("NAME"), self.dialect)

    @property
    def sex(self):
        """Person sex, one of "M", "F", or "U" for unknown (`str`).

        "U" is returned both when the SEX record is missing and when it is
        present but has no value.
        """
        # +1 SEX <SEX_VALUE>
        sex_rec = self.sub_tag("SEX")
        if sex_rec is not None and sex_rec.value:
            return sex_rec.value
        return "U"

    @property
    def mother(self):
        """Mother of this individual (`Individual` or ``None``).

        Found via the ``FAMC/WIFE`` path; the result is cached after the
        first access.
        """
        if self._mother == []:
            self._mother = self.sub_tag("FAMC/WIFE")
        return self._mother

    @property
    def father(self):
        """Father of this individual (`Individual` or ``None``).

        Found via the ``FAMC/HUSB`` path; the result is cached after the
        first access.
        """
        if self._father == []:
            self._father = self.sub_tag("FAMC/HUSB")
        return self._father
# maps tag names to record class; tags not listed here use plain `Record`
_tag_class = {
    "INDI": Individual,
    "NAME": NameRec,
    "DATE": Date,
}
def make_record(level, xref_id, tag, value, sub_records, offset, dialect,
                parser=None) -> Record:
    """Create `Record` instance based on parameters.

    Parameters
    ----------
    level : `int`
        Record level number.
    xref_id : `str`
        Record reference ID, possibly empty.
    tag : `str`
        Tag name.
    value : `str`
        Record value, possibly empty. Value can be ``None``, bytes, or
        string object, if it is bytes then it should be decoded into
        strings before calling freeze(), this is normally done by the
        parser which knows about encodings.
    sub_records : `list` [ `Record` ]
        Initial list of subordinate records, possibly empty. List can be
        updated later.
    offset : `int`
        Record location in a file.
    dialect : `Dialect`
        One of `Dialect` enums.
    parser : `~ged4py.parser.GedcomReader`
        Parser instance, only needed for pointer records.

    Returns
    -------
    record : `Record`
        Instance of `Record` (or one of its subclasses).

    Notes
    -----
    This is the factory method for record instances, it can create
    different types of record based on tag or value:

    - if value has a pointer form (``@ref_id@``) then `Pointer` instance
      is created
    - if tag is "INDI" then `Individual` instance is created
    - if tag is "NAME" then `NameRec` instance is created
    - if tag is "DATE" then `Date` instance is created
    - otherwise `Record` instance is created

    Returned record is not complete, it could be updated by parser. When
    parser finishes updates it calls `Record.freeze()` method to finalize
    record construction.
    """
    # value can be bytes or string so we check for both, 64 is code for '@'
    looks_like_pointer = False
    if value and len(value) > 2:
        first, last = value[0], value[-1]
        looks_like_pointer = (first == '@' and last == '@') or \
            (first == 64 and last == 64)

    rec: Record
    if looks_like_pointer:
        # this looks like a <pointer>, make a Pointer record
        rec = Pointer(parser)
    else:
        # pick the specialized class for the tag, plain Record otherwise
        rec = _tag_class.get(tag, Record)()

    rec.level = level
    rec.xref_id = xref_id
    rec.tag = tag
    rec.value = value
    rec.sub_records = sub_records
    rec.offset = offset
    rec.dialect = dialect

    return rec