From 4171159175d8f5592c3528ba9c97050114341627 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Sun, 1 Mar 2026 18:35:17 +0100 Subject: [PATCH 01/17] no dependency on attrs anymore. tests pass. --- setup.cfg | 6 +- src/csvw/db.py | 139 +++++------ src/csvw/dsv_dialects.py | 115 +++------ src/csvw/jsonld.py | 10 +- src/csvw/metadata.py | 506 +++++++++++++++++++-------------------- src/csvw/utils.py | 23 +- 6 files changed, 358 insertions(+), 441 deletions(-) diff --git a/setup.cfg b/setup.cfg index 51eccc6..b54b6ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,12 +23,12 @@ classifiers = Natural Language :: English Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy License :: OSI Approved :: Apache Software License @@ -38,7 +38,7 @@ zip_safe = False packages = find: package_dir = = src -python_requires = >=3.8 +python_requires = >=3.9 install_requires = attrs>=18.1 isodate @@ -112,7 +112,7 @@ show_missing = true skip_covered = true [tox:tox] -envlist = py38, py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313, py314 isolated_build = true skip_missing_interpreter = true diff --git a/src/csvw/db.py b/src/csvw/db.py index 8a03c62..76c75ce 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -26,69 +26,62 @@ not enforced by the database. 
""" import json -import typing +from typing import Optional, Union, Protocol, Callable, Any import decimal import pathlib import sqlite3 import functools import contextlib import collections - -import attr +from collections.abc import Sequence +import dataclasses import csvw from csvw.datatypes import DATATYPES from csvw.metadata import TableGroup +from .utils import optional def identity(s): return s +@dataclasses.dataclass +class DBType: + name: str + convert: Callable[[Any], Any] = identity + read: Callable[[Any], Any] = identity + + TYPE_MAP = { - 'string': ( - 'TEXT', - identity, - identity), - 'integer': ( - 'INTEGER', - identity, - identity), - 'boolean': ( - 'INTEGER', - lambda s: s if s is None else int(s), - lambda s: s if s is None else bool(s)), - 'decimal': ( - 'REAL', - lambda s: s if s is None else float(s), - lambda s: s if s is None else decimal.Decimal(s)), - 'hexBinary': ( - 'BLOB', - identity, - identity), + 'string': DBType('TEXT'), + 'integer': DBType('INTEGER'), + 'boolean': DBType('INTEGER', optional(int), optional(bool)), + 'decimal': DBType('REAL', optional(float), optional(decimal.Decimal)), + 'hexBinary': DBType('BLOB'), } -class SchemaTranslator(typing.Protocol): - def __call__(self, table: str, column: typing.Optional[str] = None) -> str: +class SchemaTranslator(Protocol): + def __call__(self, table: str, column: Optional[str] = None) -> str: ... # pragma: no cover -class ColumnTranslator(typing.Protocol): +class ColumnTranslator(Protocol): def __call__(self, column: str) -> str: ... # pragma: no cover -def quoted(*names): - return ','.join('`{0}`'.format(name) for name in names) +def quoted(*names: str) -> str: + return ','.join(f'`{name}`' for name in names) def insert(db: sqlite3.Connection, translate: SchemaTranslator, table: str, - keys: typing.Sequence[str], + keys: Sequence[str], *rows: list, - single: typing.Optional[bool] = False): + single: Optional[bool] = False): """ Insert a sequence of rows into a table. 
@@ -117,37 +110,35 @@ def insert(db: sqlite3.Connection, raise -def select(db: sqlite3.Connection, table: str) -> typing.Tuple[typing.List[str], typing.Sequence]: - cu = db.execute("SELECT * FROM {0}".format(quoted(table))) +def select(db: sqlite3.Connection, table: str) -> tuple[list[str], Sequence]: + cu = db.execute(f"SELECT * FROM {quoted(table)}") cols = [d[0] for d in cu.description] return cols, list(cu.fetchall()) -@attr.s +@dataclasses.dataclass class ColSpec: """ A `ColSpec` captures sufficient information about a :class:`csvw.Column` for the DB schema. """ - name = attr.ib() - csvw_type = attr.ib(default='string', converter=lambda s: s if s else 'string') - separator = attr.ib(default=None) - db_type = attr.ib(default=None) - convert = attr.ib(default=None) - read = attr.ib(default=None) - required = attr.ib(default=False) - csvw = attr.ib(default=None) - - def __attrs_post_init__(self): + name: str + csvw_type: str = 'string' + separator: str = None + db_type: DBType = None + required: bool = False + csvw: str = None + + def __post_init__(self): + self.csvw_type = self.csvw_type or 'string' if self.csvw_type in TYPE_MAP: - self.db_type, self.convert, self.read = TYPE_MAP[self.csvw_type] + self.db_type = TYPE_MAP[self.csvw_type] else: - self.db_type = 'TEXT' - self.convert = DATATYPES[self.csvw_type].to_string - self.read = DATATYPES[self.csvw_type].to_python + self.db_type = DBType( + 'TEXT', DATATYPES[self.csvw_type].to_string, DATATYPES[self.csvw_type].to_python) if self.separator and self.db_type != 'TEXT': - self.db_type = 'TEXT' + self.db_type = DBType('TEXT', self.db_type.convert, self.db_type.read) - def check(self, translate: ColumnTranslator) -> typing.Optional[str]: + def check(self, translate: ColumnTranslator) -> Optional[str]: """ We try to convert as many data constraints as possible into SQLite CHECK constraints. 
@@ -184,15 +175,13 @@ def check(self, translate: ColumnTranslator) -> typing.Optional[str]: def sql(self, translate: ColumnTranslator) -> str: _check = self.check(translate) - return '`{0}` {1}{2}{3}'.format( - translate(self.name), - self.db_type, - ' NOT NULL' if self.required else '', - ' CHECK ({0})'.format(_check) if _check else '') + null_constraint = ' NOT NULL' if self.required else '' + check_constraint = f' CHECK ({_check})' if _check else '' + return f'`{translate(self.name)}` {self.db_type.name}{null_constraint}{check_constraint}' -@attr.s -class TableSpec(object): +@dataclasses.dataclass +class TableSpec: """ A `TableSpec` captures sufficient information about a :class:`csvw.Table` for the DB schema. @@ -205,16 +194,16 @@ class TableSpec(object): .. seealso:: ``_ """ - name = attr.ib() - columns = attr.ib(default=attr.Factory(list)) - foreign_keys = attr.ib(default=attr.Factory(list)) - many_to_many = attr.ib(default=attr.Factory(collections.OrderedDict)) - primary_key = attr.ib(default=None) + name: str + columns: list[ColSpec] = dataclasses.field(default_factory=list) + foreign_keys: list = dataclasses.field(default_factory=list) + many_to_many: collections.OrderedDict = dataclasses.field(default_factory=collections.OrderedDict) + primary_key: Optional[list[str]] = None @classmethod def from_table_metadata(cls, table: csvw.Table, - drop_self_referential_fks: typing.Optional[bool] = True) -> 'TableSpec': + drop_self_referential_fks: Optional[bool] = True) -> 'TableSpec': """ Create a `TableSpec` from the schema description of a `csvw.metadata.Table`. @@ -305,7 +294,7 @@ def sql(self, translate: SchemaTranslator) -> str: def schema(tg: csvw.TableGroup, - drop_self_referential_fks: typing.Optional[bool] = True) -> typing.List[TableSpec]: + drop_self_referential_fks: Optional[bool] = True) -> list[TableSpec]: """ Convert the table and column descriptions of a `TableGroup` into specifications for the DB schema. 
@@ -365,9 +354,9 @@ class Database(object): def __init__( self, tg: TableGroup, - fname: typing.Optional[typing.Union[pathlib.Path, str]] = None, - translate: typing.Optional[SchemaTranslator] = None, - drop_self_referential_fks: typing.Optional[bool] = True, + fname: Optional[Union[pathlib.Path, str]] = None, + translate: Optional[SchemaTranslator] = None, + drop_self_referential_fks: Optional[bool] = True, ): self.translate = translate or Database.name_translator self.fname = pathlib.Path(fname) if fname else None @@ -380,11 +369,11 @@ def init_schema(self, tg, drop_self_referential_fks=True): self.tg, drop_self_referential_fks=drop_self_referential_fks) if self.tg else [] @property - def tdict(self) -> typing.Dict[str, TableSpec]: + def tdict(self) -> dict[str, TableSpec]: return {t.name: t for t in self.tables} @staticmethod - def name_translator(table: str, column: typing.Optional[str] = None) -> str: + def name_translator(table: str, column: Optional[str] = None) -> str: """ A callable with this signature can be passed into DB creation to control the names of the schema objects. @@ -396,7 +385,7 @@ def name_translator(table: str, column: typing.Optional[str] = None) -> str: # By default, no translation is done: return column or table - def connection(self) -> typing.Union[sqlite3.Connection, contextlib.closing]: + def connection(self) -> Union[sqlite3.Connection, contextlib.closing]: if self.fname: return contextlib.closing(sqlite3.connect(str(self.fname))) if not self._connection: @@ -420,7 +409,7 @@ def select_many_to_many(self, db, table, context) -> dict: r[0]: [(k, v) if context is None else k for k, v in zip(r[1].split(), r[2].split('||'))] for r in cu.fetchall()} - def separator(self, tname: str, cname: str) -> typing.Optional[str]: + def separator(self, tname: str, cname: str) -> Optional[str]: """ :return: separator for the column specified by db schema names `tname` and `cname`. 
""" @@ -430,11 +419,11 @@ def separator(self, tname: str, cname: str) -> typing.Optional[str]: if self.translate(name, col.name) == cname: return col.separator - def split_value(self, tname, cname, value) -> typing.Union[typing.List[str], str, None]: + def split_value(self, tname, cname, value) -> Union[list[str], str, None]: sep = self.separator(tname, cname) return (value or '').split(sep) if sep else value - def read(self) -> typing.Dict[str, typing.List[typing.OrderedDict]]: + def read(self) -> dict[str, list[collections.OrderedDict]]: """ :return: A `dict` where keys are SQL table names corresponding to CSVW tables and values \ are lists of rows, represented as dicts where keys are the SQL column names. @@ -453,7 +442,7 @@ def read(self) -> typing.Dict[str, typing.List[typing.OrderedDict]]: for col in table.columns: convert[self.translate(tname, col.name)] = [col.name, identity] if col.csvw_type in TYPE_MAP: - convert[self.translate(tname, col.name)][1] = TYPE_MAP[col.csvw_type][2] + convert[self.translate(tname, col.name)][1] = TYPE_MAP[col.csvw_type].convert else: convert[self.translate(tname, col.name)][1] = \ DATATYPES[col.csvw_type].to_python @@ -564,11 +553,11 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): # Note: This assumes list-valued columns are of datatype string! if col.csvw_type == 'string': v = (col.separator or ';').join( - col.convert(vv) or '' for vv in v) + col.db_type.convert(vv) or '' for vv in v) else: v = json.dumps(v) else: - v = col.convert(v) if v is not None else None + v = col.db_type.convert(v) if v is not None else None if i == 0: keys.append(col.name) values.append(v) diff --git a/src/csvw/dsv_dialects.py b/src/csvw/dsv_dialects.py index e35f391..fead1a1 100644 --- a/src/csvw/dsv_dialects.py +++ b/src/csvw/dsv_dialects.py @@ -9,9 +9,10 @@ - ``_ - ``_ """ -import attr +import typing import warnings import functools +import dataclasses from . 
import utils @@ -22,15 +23,6 @@ } -# FIXME: replace with attrs.validators.ge(0) from attrs 21.3.0 -def _non_negative(instance, attribute, value): - if value < 0: # pragma: no cover - raise ValueError('{0} is not a valid {1}'.format(value, attribute.name)) - - -non_negative_int = [attr.validators.instance_of(int), _non_negative] - - def convert_encoding(s): s = utils.converter(str, 'utf-8', s) try: @@ -41,82 +33,47 @@ def convert_encoding(s): return 'utf-8' -@attr.s -class Dialect(object): +@dataclasses.dataclass +class Dialect: """ A CSV dialect specification. .. seealso:: ``_ """ - encoding = attr.ib( - default='utf-8', - converter=convert_encoding, - validator=attr.validators.instance_of(str)) - - lineTerminators = attr.ib( - converter=functools.partial(utils.converter, list, ['\r\n', '\n']), - default=attr.Factory(lambda: ['\r\n', '\n'])) - - quoteChar = attr.ib( - converter=functools.partial(utils.converter, str, '"', allow_none=True), - default='"', - ) - - doubleQuote = attr.ib( - default=True, - converter=functools.partial(utils.converter, bool, True), - validator=attr.validators.instance_of(bool)) - - skipRows = attr.ib( - default=0, - converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0), - validator=non_negative_int) - - commentPrefix = attr.ib( - default='#', - converter=functools.partial(utils.converter, str, '#', allow_none=True), - validator=attr.validators.optional(attr.validators.instance_of(str))) - - header = attr.ib( - default=True, - converter=functools.partial(utils.converter, bool, True), - validator=attr.validators.instance_of(bool)) - - headerRowCount = attr.ib( - default=1, - converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0), - validator=non_negative_int) - - delimiter = attr.ib( - default=',', - converter=functools.partial(utils.converter, str, ','), - validator=attr.validators.instance_of(str)) - - skipColumns = attr.ib( - default=0, - converter=functools.partial(utils.converter, int, 0, 
cond=lambda s: s >= 0),
-        validator=non_negative_int)
-
-    skipBlankRows = attr.ib(
-        default=False,
-        converter=functools.partial(utils.converter, bool, False),
-        validator=attr.validators.instance_of(bool))
-
-    skipInitialSpace = attr.ib(
-        default=False,
-        converter=functools.partial(utils.converter, bool, False),
-        validator=attr.validators.instance_of(bool))
-
-    trim = attr.ib(
-        default='false',
-        validator=attr.validators.in_(['true', 'false', 'start', 'end']),
-        converter=lambda v: functools.partial(
-            utils.converter,
-            (str, bool), 'false')('{0}'.format(v).lower() if isinstance(v, bool) else v))
+    encoding: str = 'utf-8'
+    lineTerminators: list[str] = dataclasses.field(default_factory=lambda: ['\r\n', '\n'])
+    quoteChar: str = '"'
+    doubleQuote: bool = True
+    skipRows: int = 0
+    commentPrefix: str = '#'
+    header: bool = True
+    headerRowCount: int = 1
+    delimiter: str = ','
+    skipColumns: int = 0
+    skipBlankRows: bool = False
+    skipInitialSpace: bool = False
+    trim: typing.Literal['true', 'false', 'start', 'end'] = 'false'
+
+    def __post_init__(self):
+        self.encoding = convert_encoding(self.encoding)
+        self.lineTerminators = utils.converter(list, ['\r\n', '\n'], self.lineTerminators)
+        self.quoteChar = utils.converter(str, '"', self.quoteChar, allow_none=True)
+        self.doubleQuote = utils.converter(bool, True, self.doubleQuote)
+        self.skipRows = utils.converter(int, 0, self.skipRows, cond=lambda s: s >= 0)
+        self.commentPrefix = utils.converter(str, '#', self.commentPrefix, allow_none=True)
+        self.header = utils.converter(bool, True, self.header)
+        self.headerRowCount = utils.converter(
+            int, 1, self.headerRowCount, cond=lambda s: s >= 0)
+        self.delimiter = utils.converter(str, ',', self.delimiter)
+        self.skipColumns = utils.converter(int, 0, self.skipColumns, cond=lambda s: s >= 0)
+        self.skipBlankRows = utils.converter(bool, False, self.skipBlankRows)
+        self.skipInitialSpace = utils.converter(bool, False, self.skipInitialSpace)
+        self.trim = 
utils.converter((str, bool), 'false', str(self.trim).lower() if isinstance(self.trim, bool) else self.trim) + assert self.trim in ['true', 'false', 'start', 'end'], 'invalid trim' def updated(self, **kw): - res = self.__class__(**attr.asdict(self)) + res = self.__class__(**dataclasses.asdict(self)) for k, v in kw.items(): setattr(res, k, v) return res @@ -133,7 +90,9 @@ def line_terminators(self): @functools.cached_property def trimmer(self): return { + True: lambda s: s.strip(), 'true': lambda s: s.strip(), + False: lambda s: s, 'false': lambda s: s, 'start': lambda s: s.lstrip(), 'end': lambda s: s.rstrip() diff --git a/src/csvw/jsonld.py b/src/csvw/jsonld.py index 92daa46..8087b4e 100644 --- a/src/csvw/jsonld.py +++ b/src/csvw/jsonld.py @@ -6,8 +6,8 @@ import pathlib import datetime import collections +import dataclasses -import attr from rdflib import Graph, URIRef, Literal from rfc3986 import URIReference from isodate.duration import Duration @@ -50,14 +50,14 @@ def format_value(value, col): return value -@attr.s +@dataclasses.dataclass class Triple: """ A table cell's data as RDF triple. """ - about = attr.ib() - property = attr.ib() - value = attr.ib() + about: str + property: str + value: str def as_rdflib_triple(self): return ( diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 4d485e2..0539afc 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -7,13 +7,14 @@ .. 
seealso:: https://www.w3.org/TR/tabular-metadata/ """ +import datetime import io import re import json import shutil import decimal import pathlib -import typing +from typing import Optional, Union, Any, Literal import zipfile import operator import warnings @@ -21,10 +22,11 @@ import itertools import contextlib import collections +from collections.abc import Iterable, Generator +import dataclasses from urllib.parse import urljoin, urlparse, urlunparse from language_tags import tags -import attr import requests import uritemplate @@ -175,6 +177,9 @@ virtual""".split() is_url = utils.is_url +OrderedType = Union[ + int, float, decimal.Decimal, datetime.date, datetime.datetime, datetime.timedelta] + class Invalid: pass @@ -183,7 +188,7 @@ class Invalid: INVALID = Invalid() -@attr.s +@dataclasses.dataclass class Dialect(BaseDialect): """ The spec is ambiguous regarding a default for the commentPrefix property: @@ -204,10 +209,7 @@ class Dialect(BaseDialect): So, in order to pass the number formatting tests, with column names like `##.#`, we chose the second reading - i.e. by default no rows are treated as comments. """ - commentPrefix = attr.ib( - default=None, - converter=functools.partial(utils.converter, str, None, allow_none=True), - validator=attr.validators.optional(attr.validators.instance_of(str))) + commentPrefix: str = None def json_open(filename, mode='r', encoding='utf-8'): @@ -215,7 +217,7 @@ def json_open(filename, mode='r', encoding='utf-8'): return io.open(filename, mode, encoding=encoding) -def get_json(fname) -> typing.Union[list, dict]: +def get_json(fname) -> Union[list, dict]: fname = str(fname) if is_url(fname): return requests.get(fname).json(object_pairs_hook=collections.OrderedDict) @@ -254,26 +256,13 @@ def asdict(self, **kw): return '{}'.format(self) -def uri_template_property(): - """ - - Note: We do not currently provide support for supplying the "_" variables like "_row" - when expanding a URI template. - - .. 
seealso:: http://w3c.github.io/csvw/metadata/#uri-template-properties - """ - def converter_uriTemplate(v): - if v is None: - return None - if not isinstance(v, str): - warnings.warn('Invalid value for aboutUrl property') - return INVALID - return URITemplate(v) - - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of((URITemplate, Invalid))), - converter=converter_uriTemplate) +def convert_uri_template(v): + if v is None: + return None + if not isinstance(v, str): + warnings.warn('Invalid value for Url property') + return INVALID + return URITemplate(v) class Link: @@ -282,7 +271,7 @@ class Link: .. seealso:: http://w3c.github.io/csvw/metadata/#link-properties """ - def __init__(self, string: typing.Union[str, pathlib.Path]): + def __init__(self, string: Union[str, pathlib.Path]): if not isinstance(string, (str, pathlib.Path)): raise ValueError('Invalid value for link property') self.string = string @@ -312,13 +301,6 @@ def resolve(self, base): return urljoin(base, self.string) -def link_property(required=False): - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(Link)), - converter=lambda v: v if v is None else Link(v)) - - class NaturalLanguage(collections.OrderedDict): """ @@ -430,24 +412,26 @@ def valid_common_property(v): return v -@attr.s +@dataclasses.dataclass class DescriptionBase: """Container for - common properties (see http://w3c.github.io/csvw/metadata/#common-properties) - @-properties. 
""" - common_props = attr.ib(default=attr.Factory(dict)) - at_props = attr.ib(default=attr.Factory(dict)) + common_props: dict[str, Any] = dataclasses.field(default_factory=dict) + at_props: dict[str, Any] = dataclasses.field(default_factory=dict) @classmethod - def partition_properties(cls, - d: typing.Union[dict, typing.Any], - type_name: typing.Optional[str] = None, - strict=True) -> typing.Union[dict, None]: + def partition_properties( + cls, + d: Union[dict, Any], + type_name: Optional[str] = None, + strict=True + ) -> Union[dict, None]: if d and not isinstance(d, dict): return - fields = attr.fields_dict(cls) + fields = {f.name: f for f in dataclasses.fields(cls)} type_name = type_name or cls.__name__ c, a, dd = {}, {}, {} for k, v in (d or {}).items(): @@ -499,14 +483,7 @@ def asdict(self, omit_defaults=True) -> dict: if (k == 'null' or (v not in ([], {})))) -def optional_int(): - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(int)), - converter=lambda v: v if v is None else int(v)) - - -@attr.s +@dataclasses.dataclass class Datatype(DescriptionBase): """ A datatype description @@ -517,44 +494,27 @@ class Datatype(DescriptionBase): .. 
seealso:: ``_ """ - base = attr.ib( - default=None, - converter=functools.partial( + base: str = None + format: Optional[str] = None + length: Optional[int] = None + minLength: Optional[int] = None + maxLength: Optional[int] = None + minimum: OrderedType = None + maximum: OrderedType = None + minInclusive: Optional[bool] = None + maxInclusive: Optional[bool] = None + minExclusive: Optional[bool] = None + maxExclusive: Optional[bool] = None + + def __post_init__(self): + self.base = functools.partial( utils.converter, - str, 'string', allow_none=True, cond=lambda ss: ss is None or ss in DATATYPES), - validator=attr.validators.optional(attr.validators.in_(DATATYPES))) - format = attr.ib(default=None) - length = optional_int() - minLength = optional_int() - maxLength = optional_int() - minimum = attr.ib(default=None) - maximum = attr.ib(default=None) - minInclusive = attr.ib(default=None) - maxInclusive = attr.ib(default=None) - minExclusive = attr.ib(default=None) - maxExclusive = attr.ib(default=None) - - @classmethod - def fromvalue(cls, v: typing.Union[str, dict, 'Datatype']) -> 'Datatype': - """ - :param v: Initialization data for `cls`; either a single string that is the main datatype \ - of the values of the cell or a datatype description object, i.e. a `dict` or a `cls` \ - instance. 
- :return: An instance of `cls` - """ - if isinstance(v, str): - return cls(base=v) - - if isinstance(v, dict): - v.setdefault('base', 'string') - return cls(**cls.partition_properties(v)) - - if isinstance(v, cls): - return v - - raise ValueError(v) - - def __attrs_post_init__(self): + str, + 'string', + allow_none=True, + cond=lambda ss: ss is None or ss in DATATYPES)(self.base) + for att in ('length', 'maxLength', 'minLength'): + setattr(self, att, utils.optional(int)(getattr(self, att))) for attr_ in [ 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' ]: @@ -618,6 +578,26 @@ def __attrs_post_init__(self): self.format = None warnings.warn('Invalid number pattern') + @classmethod + def fromvalue(cls, v: Union[str, dict, 'Datatype']) -> 'Datatype': + """ + :param v: Initialization data for `cls`; either a single string that is the main datatype \ + of the values of the cell or a datatype description object, i.e. a `dict` or a `cls` \ + instance. + :return: An instance of `cls` + """ + if isinstance(v, str): + return cls(base=v) + + if isinstance(v, dict): + v.setdefault('base', 'string') + return cls(**cls.partition_properties(v)) + + if isinstance(v, cls): + return v + + raise ValueError(v) + def asdict(self, omit_defaults=True): res = DescriptionBase.asdict(self, omit_defaults=omit_defaults) for attr_ in [ @@ -677,22 +657,7 @@ def read(self, v): return self.validate(self.parse(v)) -def converter_null(v): - res = [] if v is None else (v if isinstance(v, list) else [v]) - if not all(isinstance(vv, str) for vv in res): - warnings.warn('Invalid null property') - return [""] - return res - - -def converter_lang(v): - if not tags.check(v): - warnings.warn('Invalid language tag') - return 'und' - return v - - -@attr.s +@dataclasses.dataclass class Description(DescriptionBase): """Adds support for inherited properties. @@ -703,35 +668,44 @@ class Description(DescriptionBase): # reference to the containing object. 
Note that this attribute is ignored when judging # equality between objects. Thus, identically specified columns of different tables will be # considered equal. - _parent = attr.ib(default=None, repr=False, eq=False) - - aboutUrl = uri_template_property() - datatype = attr.ib( - default=None, - converter=lambda v: v if not v else Datatype.fromvalue(v)) - default = attr.ib( - default="", - converter=functools.partial(utils.converter, str, "", allow_list=False), - ) - lang = attr.ib(default="und", converter=converter_lang) - null = attr.ib(default=attr.Factory(lambda: [""]), converter=converter_null) - ordered = attr.ib( - default=None, - converter=functools.partial(utils.converter, bool, False, allow_none=True), - ) - propertyUrl = uri_template_property() - required = attr.ib(default=None) - separator = attr.ib( - converter=functools.partial(utils.converter, str, None, allow_none=True), - default=None, - ) - textDirection = attr.ib( - default=None, - converter=functools.partial( - utils.converter, - str, None, allow_none=True, cond=lambda v: v in [None, "ltr", "rtl", "auto", "inherit"]) - ) - valueUrl = uri_template_property() + _parent: Optional[DescriptionBase] = None + + aboutUrl: Optional[Union[URITemplate, Invalid]] = None + datatype: Optional[Datatype] = None + default: Optional[Union[str, list[str]]] = "" + lang: str = "und" + null: list[str] = dataclasses.field(default_factory=lambda: [""]) + ordered: Optional[bool] = None + propertyUrl: Optional[Union[URITemplate, Invalid]] = None + required: Optional[bool] = None + separator: Optional[str] = None + textDirection: Optional[Literal["ltr", "rtl", "auto", "inherit"]] = None + valueUrl: Optional[Union[URITemplate, Invalid]] = None + + def __post_init__(self): + if self.datatype is not None: + self.datatype = Datatype.fromvalue(self.datatype) + self.default = utils.converter(str, "", self.default, allow_list=False) + if not tags.check(self.lang): + warnings.warn('Invalid language tag') + self.lang = 'und' + + 
self.null = [] if self.null is None else \ + (self.null if isinstance(self.null, list) else [self.null]) + if not all(isinstance(vv, str) for vv in self.null): + warnings.warn('Invalid null property') + self.null = [""] + self.ordered = utils.converter( bool, False, self.ordered, allow_none=True) + self.separator = utils.converter( str, None, self.separator, allow_none=True) + self.textDirection = utils.converter( + str, + None, + self.textDirection, + allow_none=True, + cond=lambda v: v in [None, "ltr", "rtl", "auto", "inherit"]) + for att in ('valueUrl', 'aboutUrl', 'propertyUrl'): + if getattr(self, att) is not None: + setattr(self, att, convert_uri_template(getattr(self, att))) def inherit(self, attr): v = getattr(self, attr) @@ -747,15 +721,7 @@ def inherit_null(self): return self.null -def converter_titles(v): - try: - return v if v is None else NaturalLanguage(v) - except ValueError: - warnings.warn('Invalid titles property') - return None - - -@attr.s +@dataclasses.dataclass class Column(Description): """ A column description is an object that describes a single column. @@ -766,25 +732,34 @@ class Column(Description): .. 
seealso:: ``_ """ - name = attr.ib( - default=None, - converter=functools.partial(utils.converter, str, None, allow_none=True) - ) - suppressOutput = attr.ib( - default=False, - converter=functools.partial(utils.converter, bool, False)) - titles = attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(NaturalLanguage)), - converter=converter_titles) - virtual = attr.ib(default=False, converter=functools.partial(utils.converter, bool, False)) - _number = attr.ib(default=None, repr=False) + name: str = None + suppressOutput: bool = False + titles: Optional[NaturalLanguage] = None + virtual: bool = False + _number: Optional[int] = None + + def __post_init__(self): + super().__post_init__() + self.name = utils.converter( str, None, self.name, allow_none=True) + self.suppressOutput = utils.converter( bool, False, self.suppressOutput) + + if self.titles is not None: + try: + self.titles = NaturalLanguage(self.titles) + except ValueError: + warnings.warn('Invalid titles property') + self.titles = None + + self.virtual = utils.converter( bool, False, self.virtual) def __str__(self): return self.name or \ (self.titles and self.titles.getfirst()) or \ '_col.{}'.format(self._number) + def __eq__(self, other): + return self.asdict() == other.asdict() + def has_title(self, v): if self.name and self.name == v: return True @@ -852,30 +827,33 @@ def fmt(v): return fmt(v) -def column_reference(): - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(list)), - converter=lambda v: v if isinstance(v, list) or v is None else [v]) - - -@attr.s +@dataclasses.dataclass class Reference: + resource: Optional[Link] = None + schemaReference: Optional[Link] = None + columnReference: Optional[list[str]] = None + + def __post_init__(self): + if self.resource is not None: + if self.schemaReference is not None: + raise ValueError(self) + self.resource = Link(self.resource) - resource = link_property() - schemaReference 
= link_property() - columnReference = column_reference() + if self.schemaReference is not None: + self.schemaReference = Link(self.schemaReference) - def __attrs_post_init__(self): - if self.resource is not None and self.schemaReference is not None: - raise ValueError(self) + if isinstance(self.columnReference, str): + self.columnReference = [self.columnReference] -@attr.s +@dataclasses.dataclass class ForeignKey: + columnReference: Optional[list[str]] = None + reference: Optional[Reference] = None - columnReference = column_reference() - reference = attr.ib(default=None) + def __post_init__(self): + if isinstance(self.columnReference, str): + self.columnReference = [self.columnReference] @classmethod def fromdict(cls, d): @@ -895,17 +873,7 @@ def asdict(self, **kw): return res -def converter_foreignKeys(v): - res = [] - for d in functools.partial(utils.converter, dict, None)(v): - try: - res.append(ForeignKey.fromdict(d)) - except TypeError: - warnings.warn('Invalid foreignKeys spec') - return res - - -@attr.s +@dataclasses.dataclass class Schema(Description): """ A schema description is an object that encodes the information about a schema, which describes @@ -916,21 +884,33 @@ class Schema(Description): .. 
seealso:: ``_ """ - columns = attr.ib( - default=attr.Factory(list), - converter=lambda v: [ - Column.fromvalue(c) for c in functools.partial(utils.converter, dict, None)( - functools.partial(utils.converter, list, [])(v))]) - foreignKeys = attr.ib( - default=attr.Factory(list), - converter=lambda v: [] if v is None else converter_foreignKeys(v)) - primaryKey = column_reference() - rowTitles = attr.ib( - default=attr.Factory(list), - converter=lambda v: v if isinstance(v, list) else [v], - ) - - def __attrs_post_init__(self): + columns: list[Column] = dataclasses.field(default_factory=list) + foreignKeys: list[ForeignKey] = dataclasses.field(default_factory=list) + primaryKey: Optional[list[str]] = None + rowTitles: list[str] = dataclasses.field(default_factory=list) + + def __post_init__(self): + super().__post_init__() + self.columns = [ + Column.fromvalue(c) for c in + utils.converter(dict, None, utils.converter(list, [], self.columns))] + for i, col in enumerate(self.columns): + col._number = i + 1 + if self.foreignKeys is None: + self.foreignKeys = [] + else: + res = [] + for d in utils.converter(dict, None, self.foreignKeys): + try: + res.append(ForeignKey.fromdict(d)) + except TypeError: + warnings.warn('Invalid foreignKeys spec') + self.foreignKeys = res + + if self.primaryKey is not None and not isinstance(self.primaryKey, list): + self.primaryKey = [self.primaryKey] + self.rowTitles = self.rowTitles if isinstance(self.rowTitles, list) else [self.rowTitles] + virtual, seen, names = False, set(), set() for i, col in enumerate(self.columns): if col.name and (col.name.startswith('_') or re.search(r'\s', col.name)): @@ -948,7 +928,7 @@ def __attrs_post_init__(self): names.add(col.name) seen.add(col.header) col._parent = self - col._number = i + 1 + #col._number = i + 1 for colref in self.primaryKey or []: col = self.columndict.get(colref) if col and not col.name: @@ -997,14 +977,7 @@ def dialect_props(d): return partitioned -def valid_transformations(instance, 
attribute, value): - if not isinstance(value, list): - warnings.warn('Invalid transformations property') - for tr in value: - Description.partition_properties(tr, type_name='Template') - - -@attr.s +@dataclasses.dataclass class TableLike(Description): """ A CSVW description object as encountered "in the wild", i.e. identified by URL on the web or @@ -1031,29 +1004,32 @@ class TableLike(Description): and `URI template properties `_ (see :meth:`~TableLike.expand`). """ - dialect = attr.ib( - default=None, - converter=lambda v: v if (v is None or isinstance(v, str)) - else Dialect(**dialect_props(v))) - notes = attr.ib(default=attr.Factory(list)) - tableDirection = attr.ib( - default='auto', - converter=functools.partial( - utils.converter, str, 'auto', cond=lambda s: s in ['rtl', 'ltr', 'auto']), - validator=attr.validators.in_(['rtl', 'ltr', 'auto'])) - tableSchema = attr.ib( - default=None, - converter=lambda v: Schema.fromvalue(v)) - transformations = attr.ib( - validator=valid_transformations, - default=attr.Factory(list), - ) - url = link_property() - _fname = attr.ib(default=None) # The path of the metadata file. - - def __attrs_post_init__(self): + dialect: Optional[Union[str, Dialect]] = None + notes: list[str] = dataclasses.field(default_factory=list) + tableDirection: Literal['rtl', 'ltr', 'auto'] = 'auto' + tableSchema: Optional[Schema] = None + transformations: list = dataclasses.field(default_factory=list) + url: Optional[Link] = None + _fname: Union[str, pathlib.Path] = None # The path of the metadata file. 
+ + def __post_init__(self): + super().__post_init__() if isinstance(self.dialect, str): self.dialect = Dialect(**dialect_props(get_json(Link(self.dialect).resolve(self.base)))) + elif self.dialect is not None: + self.dialect = Dialect(**dialect_props(self.dialect)) + + self.tableDirection = utils.converter( + str, 'auto', self.tableDirection, cond=lambda s: s in ['rtl', 'ltr', 'auto']) + self.tableSchema = Schema.fromvalue(self.tableSchema) + + if not isinstance(self.transformations, list): + warnings.warn('Invalid transformations property') + for tr in self.transformations: + Description.partition_properties(tr, type_name='Template') + if self.url is not None: + self.url = Link(self.url) + if self.tableSchema and not (isinstance(self.tableSchema, str)): self.tableSchema._parent = self if 'id' in self.at_props and self.at_props['id'] is None: @@ -1078,7 +1054,7 @@ def get_column(self, spec): return self.tableSchema.get_column(spec) if self.tableSchema else None @classmethod - def from_file(cls, fname: typing.Union[str, pathlib.Path], data=None) -> 'TableLike': + def from_file(cls, fname: Union[str, pathlib.Path], data=None) -> 'TableLike': """ Instantiate a CSVW Table or TableGroup description from a metadata file. """ @@ -1102,7 +1078,7 @@ def from_url(cls, url: str, data=None) -> 'TableLike': res = cls.fromvalue(data) return res - def to_file(self, fname: typing.Union[str, pathlib.Path], omit_defaults=True) -> pathlib.Path: + def to_file(self, fname: Union[str, pathlib.Path], omit_defaults=True) -> pathlib.Path: """ Write a CSVW Table or TableGroup description as JSON object to a local file. @@ -1117,7 +1093,7 @@ def to_file(self, fname: typing.Union[str, pathlib.Path], omit_defaults=True) -> return fname @property - def base(self) -> typing.Union[str, pathlib.Path]: + def base(self) -> Union[str, pathlib.Path]: """ The "base" to resolve relative links against. 
""" @@ -1176,7 +1152,7 @@ def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> return res -@attr.s +@dataclasses.dataclass class Table(TableLike): """ A table description is an object that describes a table within a CSV file. @@ -1191,7 +1167,7 @@ class Table(TableLike): .. seealso:: ``_ """ - suppressOutput = attr.ib(default=False) + suppressOutput: bool = False _comments = [] def add_foreign_key(self, colref, ref_resource, ref_colref): @@ -1211,24 +1187,24 @@ def add_foreign_key(self, colref, ref_resource, ref_colref): 'reference': {'resource': ref_resource, 'columnReference': ref_colref} })) - def __attrs_post_init__(self): - TableLike.__attrs_post_init__(self) + def __post_init__(self): + TableLike.__post_init__(self) if not self.url: raise ValueError('url property is required for Tables') @property - def local_name(self) -> typing.Union[str, None]: + def local_name(self) -> Union[str, None]: return self.url.string if self.url else None def _get_dialect(self) -> Dialect: return self.dialect or (self._parent and self._parent.dialect) or Dialect() def write(self, - items: typing.Iterable[typing.Union[dict, list, tuple]], - fname: typing.Optional[typing.Union[str, pathlib.Path]] = DEFAULT, - base: typing.Optional[typing.Union[str, pathlib.Path]] = None, - strict: typing.Optional[bool] = False, - _zipped: typing.Optional[bool] = False) -> typing.Union[str, int]: + items: Iterable[Union[dict, list, tuple]], + fname: Optional[Union[str, pathlib.Path]] = DEFAULT, + base: Optional[Union[str, pathlib.Path]] = None, + strict: Optional[bool] = False, + _zipped: Optional[bool] = False) -> Union[str, int]: """ Write row items to a CSV file according to the table schema. 
@@ -1307,7 +1283,7 @@ def iterdicts( fname=None, _Row=collections.OrderedDict, strict=True, - ) -> typing.Generator[dict, None, None]: + ) -> Generator[dict, None, None]: """Iterate over the rows of the table Create an iterator that maps the information in each row to a `dict` whose keys are @@ -1460,17 +1436,7 @@ def iterdicts( self._comments = reader.comments -def converter_tables(v): - res = [] - for vv in v: - if not isinstance(vv, (dict, Table)): - warnings.warn('Invalid value for Table spec') - else: - res.append(Table.fromvalue(vv) if isinstance(vv, dict) else vv) - return res - - -@attr.s +@dataclasses.dataclass class TableGroup(TableLike): """ A table group description is an object that describes a group of tables. @@ -1485,10 +1451,17 @@ class TableGroup(TableLike): .. seealso:: ``_ """ - tables = attr.ib(repr=False, default=attr.Factory(list), converter=converter_tables) + tables: list[Table] = dataclasses.field(default_factory=list) - def __attrs_post_init__(self): - TableLike.__attrs_post_init__(self) + def __post_init__(self): + res = [] + for vv in self.tables: + if not isinstance(vv, (dict, Table)): + warnings.warn('Invalid value for Table spec') + else: + res.append(Table.fromvalue(vv) if isinstance(vv, dict) else vv) + self.tables = res + super().__post_init__() for table in self.tables: table._parent = self @@ -1503,10 +1476,10 @@ def read(self): return {tname: list(t.iterdicts()) for tname, t in self.tabledict.items()} def write(self, - fname: typing.Union[str, pathlib.Path], - strict: typing.Optional[bool] = False, - _zipped: typing.Optional[bool] = False, - **items: typing.Iterable[typing.Union[list, tuple, dict]]): + fname: Union[str, pathlib.Path], + strict: Optional[bool] = False, + _zipped: Optional[bool] = False, + **items: Iterable[Union[list, tuple, dict]]): """ Write a TableGroup's data and metadata to files. 
@@ -1519,7 +1492,7 @@ def write(self, self.tabledict[tname].write(rows, base=fname.parent, strict=strict, _zipped=_zipped) self.to_file(fname) - def copy(self, dest: typing.Union[pathlib.Path, str]): + def copy(self, dest: Union[pathlib.Path, str]): """ Write a TableGroup's data and metadata to files relative to `dest`, adapting the `base` attribute. @@ -1534,10 +1507,10 @@ def copy(self, dest: typing.Union[pathlib.Path, str]): self.to_file(self._fname) @property - def tabledict(self) -> typing.Dict[str, Table]: + def tabledict(self) -> dict[str, Table]: return {t.local_name: t for t in self.tables} - def foreign_keys(self) -> typing.List[typing.Tuple[Table, list, Table, list]]: + def foreign_keys(self) -> list[tuple[Table, list, Table, list]]: return [ ( self.tabledict[fk.reference.resource.string], @@ -1550,6 +1523,7 @@ def foreign_keys(self) -> typing.List[typing.Tuple[Table, list, Table, list]]: def validate_schema(self, strict=False): try: for st, sc, tt, tc in self.foreign_keys(): + print(sc, tc) if len(sc) != len(tc): raise ValueError( 'Foreign key error: non-matching number of columns in source and target') @@ -1641,7 +1615,7 @@ class CSVW: """ Python API to read CSVW described data and convert it to JSON. """ - def __init__(self, url: str, md_url: typing.Optional[str] = None, validate: bool = False): + def __init__(self, url: str, md_url: Optional[str] = None, validate: bool = False): self.warnings = [] w = None with contextlib.ExitStack() as stack: @@ -1717,7 +1691,7 @@ def tablegroup(self): TableGroup(at_props={'base': self.t.base}, tables=self.tables) @staticmethod - def locate_metadata(url=None) -> typing.Tuple[dict, bool]: + def locate_metadata(url=None) -> tuple[dict, bool]: """ Implements metadata discovery as specified in `§5. 
Locating Metadata `_ diff --git a/src/csvw/utils.py b/src/csvw/utils.py index affbfdf..109fc91 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -8,8 +8,11 @@ import warnings import collections import unicodedata +from typing import Callable, Any -import attr + +def optional(type_: type) -> Callable[[Any], Any]: + return lambda v: v if v is None else type_(v) def is_url(s): @@ -35,23 +38,15 @@ def ensure_path(fname): return fname -def attr_defaults(cls): - res = collections.OrderedDict() - for field in attr.fields(cls): - default = field.default - if isinstance(default, attr.Factory): - default = default.factory() - res[field.name] = default - return res - - def attr_asdict(obj, omit_defaults=True, omit_private=True): - defs = attr_defaults(obj.__class__) + import dataclasses + res = collections.OrderedDict() - for field in attr.fields(obj.__class__): + for field in dataclasses.fields(obj): + default = field.default_factory() if callable(field.default_factory) else field.default if not (omit_private and field.name.startswith('_')): value = getattr(obj, field.name) - if not (omit_defaults and value == defs[field.name]): + if not (omit_defaults and value == default): if hasattr(value, 'asdict'): value = value.asdict(omit_defaults=True) res[field.name] = value From dafd6d1b45246e8aa51946cd51914a7ca5559ec8 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Sun, 1 Mar 2026 20:24:37 +0100 Subject: [PATCH 02/17] work on removing dateutil dep --- setup.cfg | 1 - src/csvw/_compat.py | 21 +++++++++++++++++++++ src/csvw/datatypes.py | 8 ++++---- 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 src/csvw/_compat.py diff --git a/setup.cfg b/setup.cfg index b54b6ba..3a61226 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,7 +40,6 @@ package_dir = = src python_requires = >=3.9 install_requires = - attrs>=18.1 isodate python-dateutil # Pin until fix for 2.0.0 is released (https://pypi.org/project/rfc3986/#history): diff --git a/src/csvw/_compat.py 
b/src/csvw/_compat.py
new file mode 100644
index 0000000..83cbebd
--- /dev/null
+++ b/src/csvw/_compat.py
@@ -0,0 +1,21 @@
+import re
+import sys
+import datetime
+
+
+if (sys.version_info.major, sys.version_info.minor) >= (3, 11):  # pragma: no cover
+    fromisoformat = datetime.datetime.fromisoformat
+else:
+    def fromisoformat(s: str) -> datetime.datetime:  # pragma: no cover
+        """Somewhat hacky backport of the more full-fledged date parsing support in py3.11."""
+        s = s.replace('Z', '+00:00')
+        ms_p = re.compile(r'(?P<ms>\.[0-9]+)')
+        m = ms_p.search(s)
+        ms = None
+        if m:
+            s = ms_p.sub('', s)
+            ms = float(f'0{m.group("ms")}')
+        res = datetime.datetime.fromisoformat(s)
+        if ms:
+            res = res.replace(microsecond=int(ms * 1000000) % 1000000)
+        return res
diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py
index ad17973..3a5591c 100644
--- a/src/csvw/datatypes.py
+++ b/src/csvw/datatypes.py
@@ -22,10 +22,12 @@
 import isodate
 import rfc3986
-import dateutil.parser
 import babel.numbers
 import babel.dates
 import jsonschema
+import dateutil.parser
+
+from ._compat import fromisoformat
 
 if typing.TYPE_CHECKING:  # pragma: no cover
     import csvw
@@ -329,7 +331,6 @@ def _parse(v, cls, regex, tz_marker=None):
             comps[a] = getattr(d, a)
         res = cls(**{k: int(v) for k, v in comps.items() if v is not None})
         if tz_marker:
-            # Let dateutils take care of parsing the timezone info:
             res = res.replace(tzinfo=dateutil.parser.parse(v).tzinfo)
         return res
 
@@ -340,7 +341,7 @@ def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None):
         if not match:
             raise ValueError('{} -- {} -- {}'.format(pattern, v, regex))  # pragma:
         try:
-            return dateutil.parser.isoparse(v)
+            return fromisoformat(v)
         except ValueError:
             return dateTime._parse(v, datetime.datetime, regex, tz_marker=tz_marker)
 
@@ -1061,7 +1062,6 @@ def dt_format_and_regex(fmt, no_date=False):
         "MM.dd.yyyy",  # e.g., 03.22.2015
         "M.d.yyyy",  # e.g., 3.22.2015
     }
-    time_patterns = {"HH:mm:ss", "HHmmss", "HH:mm", "HHmm"}
 
     # We map dateTime component markers to
corresponding fromat specs and regular From fe00a69b2beff427487817b6d53898d27a72fc63 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Sun, 1 Mar 2026 21:56:24 +0100 Subject: [PATCH 03/17] started linting --- src/csvw/db.py | 6 +- src/csvw/dsv_dialects.py | 4 +- src/csvw/metadata.py | 196 ++++++++++++++++++++++----------------- 3 files changed, 119 insertions(+), 87 deletions(-) diff --git a/src/csvw/db.py b/src/csvw/db.py index 76c75ce..7f947d2 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -197,7 +197,8 @@ class TableSpec: name: str columns: list[ColSpec] = dataclasses.field(default_factory=list) foreign_keys: list = dataclasses.field(default_factory=list) - many_to_many: collections.OrderedDict = dataclasses.field(default_factory=collections.OrderedDict) + many_to_many: collections.OrderedDict = dataclasses.field( + default_factory=collections.OrderedDict) primary_key: Optional[list[str]] = None @classmethod @@ -442,7 +443,8 @@ def read(self) -> dict[str, list[collections.OrderedDict]]: for col in table.columns: convert[self.translate(tname, col.name)] = [col.name, identity] if col.csvw_type in TYPE_MAP: - convert[self.translate(tname, col.name)][1] = TYPE_MAP[col.csvw_type].convert + convert[self.translate(tname, col.name)][1] = \ + TYPE_MAP[col.csvw_type].convert else: convert[self.translate(tname, col.name)][1] = \ DATATYPES[col.csvw_type].to_python diff --git a/src/csvw/dsv_dialects.py b/src/csvw/dsv_dialects.py index fead1a1..412a880 100644 --- a/src/csvw/dsv_dialects.py +++ b/src/csvw/dsv_dialects.py @@ -69,7 +69,9 @@ def __post_init__(self): self.skipColumns = utils.converter(int, 0, self.skipColumns, cond=lambda s: s >= 0) self.skipBlankRows = utils.converter(bool, False, self.skipBlankRows) self.skipInitialSpace = utils.converter(bool, False, self.skipInitialSpace) - self.trim = utils.converter((str, bool), 'false', str(self.trim).lower() if isinstance(self.trim, bool) else self.trim) + self.trim = utils.converter( + (str, bool), 'false', 
str(self.trim).lower() + if isinstance(self.trim, bool) else self.trim) assert self.trim in ['true', 'false', 'start', 'end'], 'invalid trim' def updated(self, **kw): diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 0539afc..c3abd15 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -7,7 +7,6 @@ .. seealso:: https://www.w3.org/TR/tabular-metadata/ """ -import datetime import io import re import json @@ -16,6 +15,7 @@ import pathlib from typing import Optional, Union, Any, Literal import zipfile +import datetime import operator import warnings import functools @@ -695,8 +695,8 @@ def __post_init__(self): if not all(isinstance(vv, str) for vv in self.null): warnings.warn('Invalid null property') self.null = [""] - self.ordered = utils.converter( bool, False, self.ordered, allow_none=True) - self.separator = utils.converter( str, None, self.separator, allow_none=True) + self.ordered = utils.converter(bool, False, self.ordered, allow_none=True) + self.separator = utils.converter(str, None, self.separator, allow_none=True) self.textDirection = utils.converter( str, None, @@ -740,8 +740,8 @@ class Column(Description): def __post_init__(self): super().__post_init__() - self.name = utils.converter( str, None, self.name, allow_none=True) - self.suppressOutput = utils.converter( bool, False, self.suppressOutput) + self.name = utils.converter(str, None, self.name, allow_none=True) + self.suppressOutput = utils.converter(bool, False, self.suppressOutput) if self.titles is not None: try: @@ -750,7 +750,7 @@ def __post_init__(self): warnings.warn('Invalid titles property') self.titles = None - self.virtual = utils.converter( bool, False, self.virtual) + self.virtual = utils.converter(bool, False, self.virtual) def __str__(self): return self.name or \ @@ -928,7 +928,6 @@ def __post_init__(self): names.add(col.name) seen.add(col.header) col._parent = self - #col._number = i + 1 for colref in self.primaryKey or []: col = self.columndict.get(colref) if col 
and not col.name: @@ -1523,7 +1522,6 @@ def foreign_keys(self) -> list[tuple[Table, list, Table, list]]: def validate_schema(self, strict=False): try: for st, sc, tt, tc in self.foreign_keys(): - print(sc, tc) if len(sc) != len(tc): raise ValueError( 'Foreign key error: non-matching number of columns in source and target') @@ -1561,55 +1559,84 @@ def check_referential_integrity(self, data=None, log=None, strict=False): except ValueError as e: success = False log_or_raise(str(e), log=log, level='error') - fkeys = self.foreign_keys() + fkeys = [ForeignKeyInstance(*fk) for fk in self.foreign_keys()] # FIXME: We only support Foreign Key references between tables! - fkeys = sorted(fkeys, key=lambda x: (x[0].local_name, x[1], x[2].local_name)) + fkeys = sorted( + fkeys, + key=lambda x: (x.source_table.local_name, x.source_colref, x.target_table.local_name)) # Grouping by local_name of tables - even though we'd like to have the table objects # around, too. This it to prevent going down the rabbit hole of comparing table objects # for equality, when comparison of the string names is enough. - for _, grp in itertools.groupby(fkeys, lambda x: x[0].local_name): - grp = list(grp) - table = grp[0][0] - t_fkeys = [(key, [(child, ref) for _, _, child, ref in kgrp]) - for key, kgrp in itertools.groupby(grp, lambda x: x[1])] - get_seen = [(operator.itemgetter(*key), set()) for key, _ in t_fkeys] - for row in table.iterdicts(log=log): - for get, seen in get_seen: - if get(row) in seen: - # column references for a foreign key are not unique! 
- if strict: - success = False - seen.add(get(row)) - for (key, children), (_, seen) in zip(t_fkeys, get_seen): - single_column = (len(key) == 1) - for child, ref in children: - get_ref = operator.itemgetter(*ref) - for fname, lineno, item in child.iterdicts(log=log, with_metadata=True): - colref = get_ref(item) - if colref is None: - continue - elif single_column and isinstance(colref, list): - # We allow list-valued columns as foreign key columns in case - # it's not a composite key. If a foreign key is list-valued, we - # check for a matching row for each of the values in the list. - colrefs = colref - else: - colrefs = [colref] - for colref in colrefs: - if not single_column and None in colref: # pragma: no cover - # TODO: raise if any(c is not None for c in colref)? - continue - elif colref not in seen: - log_or_raise( - '{0}:{1} Key `{2}` not found in table {3}'.format( - fname, - lineno, - colref, - table.url.string), - log=log) - success = False + for _, grp in itertools.groupby(fkeys, lambda x: x.source_table.local_name): + success = self._check_group(success, list(grp), strict, log) + return success + + def _check_group(self, success, grp: list['ForeignKeyInstance'], strict, log): + """Check all foreign keys defined on one table.""" + t_fkeys = [(key, [(fk.target_table, fk.target_colref) for fk in kgrp]) + for key, kgrp in itertools.groupby(grp, lambda x: x.source_colref)] + get_seen = [(operator.itemgetter(*key), set()) for key, _ in t_fkeys] + for row in grp[0].source_table.iterdicts(log=log): + for get, seen in get_seen: + if get(row) in seen: + # column references for a foreign key are not unique! 
+ # https://w3c.github.io/csvw/tests/#manifest-validation#test258 + if strict: + success = False + seen.add(get(row)) + for (key, children), (_, seen) in zip(t_fkeys, get_seen): + for child, ref in children: + for fname, lineno, item in child.iterdicts(log=log, with_metadata=True): + item = RowItem( + table=grp[0].source_table, + fname=fname, + lineno=lineno, + item=item, + colref=operator.itemgetter(*ref)(item)) + success = self._check_item(success, item, seen, len(key) == 1, log) return success + def _check_item(self, success, item, seen, single_column, log): # pylint: disable=R0913,R0917 + if item.colref is None: + return success + if single_column and isinstance(item.colref, list): + # We allow list-valued columns as foreign key columns in case + # it's not a composite key. If a foreign key is list-valued, we + # check for a matching row for each of the values in the list. + colrefs = item.colref + else: + colrefs = [item.colref] + for colref in colrefs: + if not single_column and None in colref: # pragma: no cover + # TODO: raise if any(c is not None for c in colref)? 
+ continue + if colref not in seen: + log_or_raise(f'{item} not found in table {item.table.url.string}', log=log) + success = False + return success + + +@dataclasses.dataclass(frozen=True) +class ForeignKeyInstance: + """Simple structure holding the specification of a foreign key.""" + source_table: Table + source_colref: list[str] + target_table: Table + target_colref: list[str] + + +@dataclasses.dataclass(frozen=True) +class RowItem: + """Bundle properties of a table row for simpler checking.""" + table: Table + fname: str + lineno: int + item: dict + colref: Union[str, list[str]] + + def __str__(self): + return f'{self.fname}:{self.lineno} Key `{self.colref}`' + class CSVW: """ @@ -1634,24 +1661,7 @@ def __init__(self, url: str, md_url: Optional[str] = None, validate: bool = Fals self.no_metadata = set(md.keys()) == {'@context', 'url'} if "http://www.w3.org/ns/csvw" not in md.get('@context', ''): raise ValueError('Invalid or no @context') - if 'tables' in md: - if not md['tables'] or not isinstance(md['tables'], list): - raise ValueError('Invalid TableGroup with empty tables property') - if is_url(url): - self.t = TableGroup.from_url(url, data=md) - self.t.validate_schema(strict=True) - else: - self.t = TableGroup.from_file(url, data=md) - else: - if is_url(url): - self.t = Table.from_url(url, data=md) - if no_header: - if self.t.dialect: - self.t.dialect.header = False # pragma: no cover - else: - self.t.dialect = Dialect(header=False) - else: - self.t = Table.from_file(url, data=md) + self._set_tables(md, url, no_header) self.tables = self.t.tables if isinstance(self.t, TableGroup) else [self.t] for table in self.tables: for col in table.tableSchema.columns: @@ -1661,6 +1671,26 @@ def __init__(self, url: str, md_url: Optional[str] = None, validate: bool = Fals if w: self.warnings.extend(w) + def _set_tables(self, md, url, no_header): + if 'tables' in md: + if not md['tables'] or not isinstance(md['tables'], list): + raise ValueError('Invalid TableGroup with 
empty tables property') + if is_url(url): + self.t = TableGroup.from_url(url, data=md) + self.t.validate_schema(strict=True) + else: + self.t = TableGroup.from_file(url, data=md) + else: + if is_url(url): + self.t = Table.from_url(url, data=md) + if no_header: + if self.t.dialect: + self.t.dialect.header = False # pragma: no cover + else: + self.t.dialect = Dialect(header=False) + else: + self.t = Table.from_file(url, data=md) + @property def is_valid(self) -> bool: """ @@ -1686,7 +1716,8 @@ def is_valid(self) -> bool: return not bool(self.warnings) @property - def tablegroup(self): + def tablegroup(self) -> TableGroup: + """The table spec.""" return self.t if isinstance(self.t, TableGroup) else \ TableGroup(at_props={'base': self.t.base}, tables=self.tables) @@ -1708,7 +1739,7 @@ def describes(md, url): if url and is_url(url): # §5.2 Link Header # https://w3c.github.io/csvw/syntax/#link-header - res = requests.head(url) + res = requests.head(url, timeout=10) no_header = bool(re.search(r'header\s*=\s*absent', res.headers.get('content-type', ''))) desc = res.links.get('describedby') if desc and desc['type'] in [ @@ -1716,16 +1747,15 @@ def describes(md, url): md = get_json(Link(desc['url']).resolve(url)) if describes(md, url): return md, no_header - else: - warnings.warn('Ignoring linked metadata because it does not reference the data') + warnings.warn('Ignoring linked metadata because it does not reference the data') # §5.3 Default Locations and Site-wide Location Configuration # https://w3c.github.io/csvw/syntax/ # #default-locations-and-site-wide-location-configuration - res = requests.get(Link('/.well-known/csvm').resolve(url)) + res = requests.get(Link('/.well-known/csvm').resolve(url), timeout=10) locs = res.text if res.status_code == 200 else '{+url}-metadata.json\ncsv-metadata.json' for line in locs.split('\n'): - res = requests.get(Link(URITemplate(line).expand(url=url)).resolve(url)) + res = 
requests.get(Link(URITemplate(line).expand(url=url)).resolve(url), timeout=10) if res.status_code == 200: try: md = res.json() @@ -1793,14 +1823,14 @@ def _table_to_json(self, table): for rownum, (_, rowsourcenum, row) in enumerate( table.iterdicts(with_metadata=True, strict=False), start=1) ] - if table._comments: - res['rdfs:comment'] = [c[1] for c in table._comments] + if table._comments: # pylint: disable=W0212 + res['rdfs:comment'] = [c[1] for c in table._comments] # pylint: disable=W0212 res['row'] = row return res - def _row_to_json(self, table, cols, row, rownum, rowsourcenum): + def _row_to_json(self, table, cols, row, rownum, rowsourcenum): # pylint: disable=R0913,R0917 res = collections.OrderedDict() - res['url'] = '{}#row={}'.format(table.url.resolve(table.base), rowsourcenum) + res['url'] = f'{table.url.resolve(table.base)}#row={rowsourcenum}' res['rownum'] = rownum if table.tableSchema.rowTitles: res['titles'] = [ @@ -1816,7 +1846,7 @@ def _row_to_json(self, table, cols, row, rownum, rowsourcenum): def _describes(self, table, cols, row, rownum): triples = [] - aboutUrl = table.tableSchema.inherit('aboutUrl') + aboutUrl = table.tableSchema.inherit('aboutUrl') # pylint: disable=invalid-name if aboutUrl: triples.append(jsonld.Triple( about=None, property='@id', value=table.expand(aboutUrl, row, _row=rownum))) @@ -1828,16 +1858,14 @@ def _describes(self, table, cols, row, rownum): # Skip null values: null = col.inherit_null() if col else table.inherit_null() - if (null and v in null) or v == "" or (v is None) or \ - (col and col.separator and v == []): + if any([null and v in null, v == "", v is None, col and col.separator and v == []]): continue triples.append(jsonld.Triple.from_col( table, col, row, - '_col.{}'.format(i) - if (not table.tableSchema.columns and not self.no_metadata) else k, + f'_col.{i}' if (not table.tableSchema.columns and not self.no_metadata) else k, v, rownum)) From df41eb93ec0966a7fc9f47bd3884358c2f70d7de Mon Sep 17 00:00:00 2001 
From: Robert Forkel Date: Mon, 2 Mar 2026 11:04:30 +0100 Subject: [PATCH 04/17] refactored validation of referential integrity --- src/csvw/metadata.py | 186 ++++++++++++++++++++++++++++--------------- 1 file changed, 122 insertions(+), 64 deletions(-) diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index c3abd15..dc5d2f0 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -8,6 +8,7 @@ .. seealso:: https://www.w3.org/TR/tabular-metadata/ """ import io +import logging import re import json import shutil @@ -179,6 +180,7 @@ OrderedType = Union[ int, float, decimal.Decimal, datetime.date, datetime.datetime, datetime.timedelta] +ColRefType = tuple[str] class Invalid: @@ -1509,33 +1511,20 @@ def copy(self, dest: Union[pathlib.Path, str]): def tabledict(self) -> dict[str, Table]: return {t.local_name: t for t in self.tables} - def foreign_keys(self) -> list[tuple[Table, list, Table, list]]: + def foreign_keys(self) -> list[tuple[Table, ColRefType, Table, ColRefType]]: return [ ( self.tabledict[fk.reference.resource.string], - fk.reference.columnReference, + tuple(fk.reference.columnReference), t, - fk.columnReference) + tuple(fk.columnReference)) for t in self.tables for fk in t.tableSchema.foreignKeys if not fk.reference.schemaReference] def validate_schema(self, strict=False): try: - for st, sc, tt, tc in self.foreign_keys(): - if len(sc) != len(tc): - raise ValueError( - 'Foreign key error: non-matching number of columns in source and target') - for scol, tcol in zip(sc, tc): - scolumn = st.tableSchema.get_column(scol, strict=strict) - tcolumn = tt.tableSchema.get_column(tcol, strict=strict) - if not (scolumn and tcolumn): - raise ValueError( - 'Foregin key error: missing column "{}" or "{}"'.format(scol, tcol)) - if scolumn.datatype and tcolumn.datatype and \ - scolumn.datatype.base != tcolumn.datatype.base: - raise ValueError( - 'Foregin key error: non-matching datatype "{}:{}" or "{}:{}"'.format( - scol, scolumn.datatype.base, tcol, 
tcolumn.datatype.base)) + for fki in [ForeignKeyInstance(*fk) for fk in self.foreign_keys()]: + fki.validate(strict=strict) except (KeyError, AssertionError) as e: raise ValueError('Foreign key error: missing table "{}" referenced'.format(e)) @@ -1559,83 +1548,152 @@ def check_referential_integrity(self, data=None, log=None, strict=False): except ValueError as e: success = False log_or_raise(str(e), log=log, level='error') + fkeys = [ForeignKeyInstance(*fk) for fk in self.foreign_keys()] # FIXME: We only support Foreign Key references between tables! - fkeys = sorted( - fkeys, - key=lambda x: (x.source_table.local_name, x.source_colref, x.target_table.local_name)) + # We group foreign key constraints by target table, because we only want to read the + # available primary keys once and then check all tables referencing the target table in + # a loop. + # # Grouping by local_name of tables - even though we'd like to have the table objects # around, too. This it to prevent going down the rabbit hole of comparing table objects # for equality, when comparison of the string names is enough. 
- for _, grp in itertools.groupby(fkeys, lambda x: x.source_table.local_name): - success = self._check_group(success, list(grp), strict, log) + fkeys = sorted( + fkeys, + key=lambda x: (x.target_table.local_name, x.pk, x.source_table.local_name)) + for _, grp in itertools.groupby(fkeys, lambda x: x.target_table.local_name): + grp = list(grp) + target_table = grp[0].target_table + fks = collections.OrderedDict() + for pk, kgrp in itertools.groupby(grp, lambda x: x.pk): + fks[tuple(pk)] = [(fk.source_table, tuple(fk.fk)) for fk in kgrp] + success = self._check_fks_referencing_table(success, target_table, fks, strict, log) + return success + + @staticmethod + def _check_fks_referencing_table( + success: bool, + target_table: Table, + fks: collections.OrderedDict[ColRefType, list[tuple[Table, ColRefType]]], + strict: bool, + log: logging.Logger, + ) -> bool: + """Check all foreign keys referencing the same table.""" + target_table = ReferencedTable( + target_table, collections.OrderedDict((fk, len(fk) == 1) for fk in fks), log) + # Now read the available primary keys for each foreign key constraint to the table. + success = target_table.get_pks(success, strict) + for pk, source_tables in fks.items(): + # For each foreign key constraint referencing `target_table` we check the fk values. + for source_table, fk in source_tables: + success = target_table.check_fks(success, pk, source_table, fk) return success - def _check_group(self, success, grp: list['ForeignKeyInstance'], strict, log): - """Check all foreign keys defined on one table.""" - t_fkeys = [(key, [(fk.target_table, fk.target_colref) for fk in kgrp]) - for key, kgrp in itertools.groupby(grp, lambda x: x.source_colref)] - get_seen = [(operator.itemgetter(*key), set()) for key, _ in t_fkeys] - for row in grp[0].source_table.iterdicts(log=log): - for get, seen in get_seen: - if get(row) in seen: - # column references for a foreign key are not unique! 
+ +@dataclasses.dataclass +class ReferencedTable: + """ + Wraps a Table object to simplify checking of foreign key references. + """ + table: Table + # The colrefs which are referenced in foreign keys to the table mapped to whether they are a + # single column or a composite key: + pks: collections.OrderedDict[ColRefType, bool] + log: logging.Logger + # We store values in table rows for each pk colref: + refs: dict[ColRefType, set] = dataclasses.field( + default_factory=lambda: collections.defaultdict(set)) + + def get_pks(self, success: bool, strict: bool) -> bool: + """Read the actual fk values in the table.""" + itemgetters = {pk: operator.itemgetter(*pk) for pk in self.pks} + for row in self.table.iterdicts(log=self.log): + for pk in self.pks: + vals = itemgetters[pk](row) + if vals in self.refs[pk]: + # Values for a primary key are not unique! # https://w3c.github.io/csvw/tests/#manifest-validation#test258 if strict: success = False - seen.add(get(row)) - for (key, children), (_, seen) in zip(t_fkeys, get_seen): - for child, ref in children: - for fname, lineno, item in child.iterdicts(log=log, with_metadata=True): - item = RowItem( - table=grp[0].source_table, - fname=fname, - lineno=lineno, - item=item, - colref=operator.itemgetter(*ref)(item)) - success = self._check_item(success, item, seen, len(key) == 1, log) + self.refs[pk].add(vals) return success - def _check_item(self, success, item, seen, single_column, log): # pylint: disable=R0913,R0917 - if item.colref is None: + def _check_item(self, success: bool, ref: 'RefValues', pk: ColRefType) -> bool: + """ + We check if the value for the foreign key are available in the referenced table. + """ + pks = self.refs[pk] + single_column = self.pks[pk] + if ref.values is None: # null-valued foreign key. return success - if single_column and isinstance(item.colref, list): - # We allow list-valued columns as foreign key columns in case - # it's not a composite key. 
If a foreign key is list-valued, we - # check for a matching row for each of the values in the list. - colrefs = item.colref + if single_column and isinstance(ref.values, list): + # We allow list-valued columns as foreign key columns in case it's not a composite key. + # If a foreign key is list-valued, we check for a matching row for each of the values + # in the list. + refs = ref.values else: - colrefs = [item.colref] - for colref in colrefs: - if not single_column and None in colref: # pragma: no cover - # TODO: raise if any(c is not None for c in colref)? + refs = [ref.values] + for ref in refs: + if not single_column and None in ref: # pragma: no cover + # A composite key and one component of the fk is null? + # TODO: raise if any(c is not None for c in values)? continue - if colref not in seen: - log_or_raise(f'{item} not found in table {item.table.url.string}', log=log) + if ref not in pks: + log_or_raise(f'{ref} not found in table {self.table.url.string}', log=self.log) success = False return success + def check_fks( + self, + success: bool, + pk: ColRefType, + source_table: Table, + fk: ColRefType, + ) -> bool: + """ + Check one fk constraint, i.e. whether the fk values in self.table actually can be found + in `target_table`. 
+ """ + for fname, lineno, item in source_table.iterdicts(log=self.log, with_metadata=True): + item = RefValues(fname=fname, lineno=lineno, values=operator.itemgetter(*fk)(item)) + success = self._check_item(success, item, pk) + return success + @dataclasses.dataclass(frozen=True) class ForeignKeyInstance: """Simple structure holding the specification of a foreign key.""" - source_table: Table - source_colref: list[str] target_table: Table - target_colref: list[str] + pk: tuple[str] + source_table: Table + fk: tuple[str] + + def validate(self, strict: bool): + if len(self.fk) != len(self.pk): + raise ValueError( + 'Foreign key error: non-matching number of columns in source and target') + for scol, tcol in zip(self.fk, self.pk): + scolumn = self.source_table.tableSchema.get_column(scol, strict=strict) + tcolumn = self.target_table.tableSchema.get_column(tcol, strict=strict) + if not (scolumn and tcolumn): + raise ValueError( + f'Foreign key error: missing column "{scol}" or "{tcol}"') + if scolumn.datatype and tcolumn.datatype and \ + scolumn.datatype.base != tcolumn.datatype.base: + raise ValueError( + 'Foregin key error: non-matching datatype "{}:{}" or "{}:{}"'.format( + scol, scolumn.datatype.base, tcol, tcolumn.datatype.base)) @dataclasses.dataclass(frozen=True) -class RowItem: +class RefValues: """Bundle properties of a table row for simpler checking.""" - table: Table fname: str lineno: int - item: dict - colref: Union[str, list[str]] + values: Union[str, list[str]] def __str__(self): - return f'{self.fname}:{self.lineno} Key `{self.colref}`' + return f'{self.fname}:{self.lineno} Key `{self.values}`' class CSVW: From b5fbf7d9d4fbe4dc51d24b5765cc5fb1c866bc29 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Mon, 2 Mar 2026 17:15:31 +0100 Subject: [PATCH 05/17] more linting --- src/csvw/dsv_dialects.py | 3 +- src/csvw/metadata.py | 1158 ++++++++++++++-------------------- src/csvw/metadata_utils.py | 335 ++++++++++ src/csvw/utils.py | 50 +- tests/conftest.py | 
3 +- tests/test_metadata.py | 2 +- tests/test_metadata_utils.py | 30 + 7 files changed, 886 insertions(+), 695 deletions(-) create mode 100644 src/csvw/metadata_utils.py create mode 100644 tests/test_metadata_utils.py diff --git a/src/csvw/dsv_dialects.py b/src/csvw/dsv_dialects.py index 412a880..cc0b320 100644 --- a/src/csvw/dsv_dialects.py +++ b/src/csvw/dsv_dialects.py @@ -15,6 +15,7 @@ import dataclasses from . import utils +from .metadata_utils import dataclass_asdict __all__ = ['Dialect'] @@ -101,7 +102,7 @@ def trimmer(self): }[self.trim] def asdict(self, omit_defaults=True): - return utils.attr_asdict(self, omit_defaults=omit_defaults) + return dataclass_asdict(self, omit_defaults=omit_defaults) @property def python_encoding(self): diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index dc5d2f0..3337716 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -1,5 +1,4 @@ -# metadata.py - +# pylint: disable=too-many-lines """Functionality to read and write metadata for CSV files. This module implements (partially) the W3C recommendation @@ -14,7 +13,7 @@ import shutil import decimal import pathlib -from typing import Optional, Union, Any, Literal +from typing import Optional, Union, Any, Literal, TypeVar import zipfile import datetime import operator @@ -36,151 +35,23 @@ from .dsv import Dialect as BaseDialect, UnicodeReaderWithLineNumber, UnicodeWriter from .frictionless import DataPackage from . 
import jsonld +from .metadata_utils import DescriptionBase, dataclass_asdict, NAMESPACES, dialect_props, \ + valid_context_property DEFAULT = object() __all__ = [ - 'TableGroup', - 'Table', 'Column', 'ForeignKey', - 'Link', 'NaturalLanguage', - 'Datatype', - 'is_url', - 'CSVW', + 'TableGroup', 'Table', 'Column', 'ForeignKey', 'Link', 'NaturalLanguage', 'Datatype', + 'is_url', 'CSVW', ] -NAMESPACES = { - 'csvw': 'http://www.w3.org/ns/csvw#', - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', - 'xsd': 'http://www.w3.org/2001/XMLSchema#', - 'dc': 'http://purl.org/dc/terms/', - 'dcat': 'http://www.w3.org/ns/dcat#', - 'prov': 'http://www.w3.org/ns/prov#', - 'schema': 'http://schema.org/', - "as": "https://www.w3.org/ns/activitystreams#", - "cc": "http://creativecommons.org/ns#", - "ctag": "http://commontag.org/ns#", - "dc11": "http://purl.org/dc/elements/1.1/", - "dctypes": "http://purl.org/dc/dcmitype/", - "dqv": "http://www.w3.org/ns/dqv#", - "duv": "https://www.w3.org/ns/duv#", - "foaf": "http://xmlns.com/foaf/0.1/", - "gr": "http://purl.org/goodrelations/v1#", - "grddl": "http://www.w3.org/2003/g/data-view#", - "ical": "http://www.w3.org/2002/12/cal/icaltzd#", - "jsonld": "http://www.w3.org/ns/json-ld#", - "ldp": "http://www.w3.org/ns/ldp#", - "ma": "http://www.w3.org/ns/ma-ont#", - "oa": "http://www.w3.org/ns/oa#", - "odrl": "http://www.w3.org/ns/odrl/2/", - "og": "http://ogp.me/ns#", - "org": "http://www.w3.org/ns/org#", - "owl": "http://www.w3.org/2002/07/owl#", - "qb": "http://purl.org/linked-data/cube#", - "rdfa": "http://www.w3.org/ns/rdfa#", - "rev": "http://purl.org/stuff/rev#", - "rif": "http://www.w3.org/2007/rif#", - "rr": "http://www.w3.org/ns/r2rml#", - "sd": "http://www.w3.org/ns/sparql-service-description#", - "sioc": "http://rdfs.org/sioc/ns#", - "skos": "http://www.w3.org/2004/02/skos/core#", - "skosxl": "http://www.w3.org/2008/05/skos-xl#", - "sosa": "http://www.w3.org/ns/sosa/", - "ssn": 
"http://www.w3.org/ns/ssn/", - "time": "http://www.w3.org/2006/time#", - "v": "http://rdf.data-vocabulary.org/#", - "vcard": "http://www.w3.org/2006/vcard/ns#", - "void": "http://rdfs.org/ns/void#", - "wdr": "http://www.w3.org/2007/05/powder#", - "wrds": "http://www.w3.org/2007/05/powder-s#", - "xhv": "http://www.w3.org/1999/xhtml/vocab#", - "xml": "http://www.w3.org/XML/1998/namespace", -} -CSVW_TERMS = """Cell -Column -Datatype -Dialect -Direction -ForeignKey -JSON -NumericFormat -Row -Schema -Table -TableGroup -TableReference -Transformation -aboutUrl -base -columnReference -columns -commentPrefix -datatype -decimalChar -default -delimiter -describes -dialect -doubleQuote -encoding -foreignKeys -format -groupChar -header -headerRowCount -json -lang -length -lineTerminators -maxExclusive -maxInclusive -maxLength -maximum -minExclusive -minInclusive -minLength -minimum -name -notes -null -ordered -pattern -primaryKey -propertyUrl -quoteChar -reference -referencedRows -required -resource -row -rowTitles -rownum -schemaReference -scriptFormat -separator -skipBlankRows -skipColumns -skipInitialSpace -skipRows -source -suppressOutput -tableDirection -tableSchema -tables -targetFormat -textDirection -titles -transformations -trim -uriTemplate -url -valueUrl -virtual""".split() is_url = utils.is_url OrderedType = Union[ int, float, decimal.Decimal, datetime.date, datetime.datetime, datetime.timedelta] ColRefType = tuple[str] +RowType = collections.OrderedDict[str, Any] +T = TypeVar('T') class Invalid: @@ -214,37 +85,6 @@ class Dialect(BaseDialect): commentPrefix: str = None -def json_open(filename, mode='r', encoding='utf-8'): - assert encoding == 'utf-8' - return io.open(filename, mode, encoding=encoding) - - -def get_json(fname) -> Union[list, dict]: - fname = str(fname) - if is_url(fname): - return requests.get(fname).json(object_pairs_hook=collections.OrderedDict) - with json_open(fname) as f: - return json.load(f, object_pairs_hook=collections.OrderedDict) - - -def 
log_or_raise(msg, log=None, level='warning', exception_cls=ValueError): - if log: - getattr(log, level)(msg) - else: - raise exception_cls(msg) - - -def nolog(level='warning'): - from types import MethodType - - class Log(object): - pass - - log = Log() - setattr(log, level, MethodType(lambda *args, **kw: None, log)) - return log - - class URITemplate(uritemplate.URITemplate): def __eq__(self, other): @@ -252,10 +92,10 @@ def __eq__(self, other): return self.uri == other if not hasattr(other, 'uri'): return False - return super(URITemplate, self).__eq__(other) + return super().__eq__(other) - def asdict(self, **kw): - return '{}'.format(self) + def asdict(self, **_): + return f'{self}' def convert_uri_template(v): @@ -278,14 +118,22 @@ def __init__(self, string: Union[str, pathlib.Path]): raise ValueError('Invalid value for link property') self.string = string + @classmethod + def from_value(cls, v: Union['Link', str, pathlib.Path]): + if isinstance(v, Link): + return v + return cls(v) + def __str__(self): return self.string - def asdict(self, omit_defaults=True): + def asdict(self, **_): + """Not really a dict, but at least a JSON-serializable datatype.""" return self.string def __eq__(self, other): - # FIXME: Only naive, un-resolved comparison is supported at the moment. + # FIXME: pylint: disable=W0511 + # Only naive, un-resolved comparison is supported at the moment. return self.string == other.string if isinstance(other, Link) else False def resolve(self, base): @@ -305,12 +153,14 @@ def resolve(self, base): class NaturalLanguage(collections.OrderedDict): """ + A natural language property holds a collection of string, optionally categorized into languages. .. 
seealso:: http://w3c.github.io/csvw/metadata/#natural-language-properties """ - - def __init__(self, value): - super(NaturalLanguage, self).__init__() + def __init__( + self, + value: Union[str, list[str], tuple[str], dict[str, Union[str, list[str], tuple[str]]]]): + super().__init__() self.value = value if isinstance(self.value, str): self[None] = [self.value] @@ -335,7 +185,8 @@ def __init__(self, value): else: raise ValueError('invalid value type for NaturalLanguage') - def asdict(self, omit_defaults=True): + def asdict(self, **_): + """Serialize as dict.""" if list(self) == [None]: if len(self[None]) == 1: return self.getfirst() @@ -344,149 +195,22 @@ def asdict(self, omit_defaults=True): ('und' if k is None else k, v[0] if len(v) == 1 else v) for k, v in self.items()) - def add(self, string, lang=None): + def add(self, string: str, lang: Optional[str] = None) -> None: + """Add a string for a language.""" if lang not in self: self[lang] = [] self[lang].append(string) - def __str__(self): + def __str__(self) -> str: return self.getfirst() or next(iter(self.values()))[0] - def getfirst(self, lang=None): + def getfirst(self, lang: Optional[str] = None) -> Optional[str]: + """Return the first string specified for the given language tag.""" return self.get(lang, [None])[0] -def valid_id_property(v): - if not isinstance(v, str): - warnings.warn('Inconsistent link property') - return None - if v.startswith('_'): - raise ValueError('Invalid @id property: {}'.format(v)) - return v - - -def valid_common_property(v): - if isinstance(v, dict): - if not {k[1:] for k in v if k.startswith('@')}.issubset( - {'id', 'language', 'type', 'value'}): - raise ValueError( - "Aside from @value, @type, @language, and @id, the properties used on an object " - "MUST NOT start with @.") - if '@value' in v: - if len(v) > 1: - if len(v) > 2 \ - or set(v.keys()) not in [{'@value', '@language'}, {'@value', '@type'}] \ - or not isinstance(v['@value'], (str, bool, int, decimal.Decimal)): - raise 
ValueError( - "If a @value property is used on an object, that object MUST NOT have " - "any other properties aside from either @type or @language, and MUST " - "NOT have both @type and @language as properties. The value of the " - "@value property MUST be a string, number, or boolean value.") - if '@language' in v and '@value' not in v: - raise ValueError( - "A @language property MUST NOT be used on an object unless it also has a " - "@value property.") - if '@id' in v: - v['@id'] = valid_id_property(v['@id']) - if '@language' in v: - if not (isinstance(v['@language'], str) and tags.check(v['@language'])): - warnings.warn('Invalid language tag') - del v['@language'] - if '@type' in v: - vv = v['@type'] - if isinstance(vv, str): - if vv.startswith('_:'): - raise ValueError( - 'The value of any @id or @type contained within a metadata document ' - 'MUST NOT be a blank node.') - if not is_url(vv) and \ - not any(vv == ns or vv.startswith(ns + ':') for ns in NAMESPACES) and \ - vv not in CSVW_TERMS: - raise ValueError( - 'The value of any member of @type MUST be either a term defined in ' - '[csvw-context], a prefixed name where the prefix is a term defined in ' - '[csvw-context], or an absolute URL.') - elif not isinstance(vv, (list, dict)): - raise ValueError('Invalid datatype for @type') - return {k: valid_common_property(vv) for k, vv in v.items()} - if isinstance(v, list): - return [valid_common_property(vv) for vv in v] - return v - - @dataclasses.dataclass -class DescriptionBase: - """Container for - - common properties (see http://w3c.github.io/csvw/metadata/#common-properties) - - @-properties. 
- """ - - common_props: dict[str, Any] = dataclasses.field(default_factory=dict) - at_props: dict[str, Any] = dataclasses.field(default_factory=dict) - - @classmethod - def partition_properties( - cls, - d: Union[dict, Any], - type_name: Optional[str] = None, - strict=True - ) -> Union[dict, None]: - if d and not isinstance(d, dict): - return - fields = {f.name: f for f in dataclasses.fields(cls)} - type_name = type_name or cls.__name__ - c, a, dd = {}, {}, {} - for k, v in (d or {}).items(): - if k.startswith('@'): - if k == '@id': - v = valid_id_property(v) - if k == '@type' and v != type_name: - raise ValueError('Invalid @type property {} for {}'.format(v, type_name)) - a[k[1:]] = v - elif ':' in k: - c[k] = valid_common_property(v) - else: - if strict and (k not in fields): - warnings.warn('Invalid property {} for {}'.format(k, type_name)) - else: - dd[k] = v - return dict(common_props=c, at_props=a, **dd) - - @classmethod - def fromvalue(cls, d: dict): - return cls(**cls.partition_properties(d)) - - def _iter_dict_items(self, omit_defaults): - def _asdict_single(v): - return v.asdict(omit_defaults=omit_defaults) if hasattr(v, 'asdict') else v - - def _asdict_multiple(v): - if isinstance(v, (list, tuple)): - return [_asdict_single(vv) for vv in v] - return _asdict_single(v) - - for k, v in sorted(self.at_props.items()): - yield '@' + k, _asdict_multiple(v) - - for k, v in sorted(self.common_props.items()): - yield k, _asdict_multiple(v) - - for k, v in utils.attr_asdict(self, omit_defaults=omit_defaults).items(): - if k not in ('common_props', 'at_props'): - yield k, _asdict_multiple(v) - - def asdict(self, omit_defaults=True) -> dict: - # Note: The `null` property is the only inherited, list-valued property where the default - # is not the empty list. Thus, to allow setting it to empty, we must treat `null` as - # special case here. 
- # See also https://www.w3.org/TR/tabular-metadata/#dfn-inherited-property - return collections.OrderedDict( - (k, v) for k, v in self._iter_dict_items(omit_defaults) - if (k == 'null' or (v not in ([], {})))) - - -@dataclasses.dataclass -class Datatype(DescriptionBase): +class Datatype(DescriptionBase): # pylint: disable=too-many-instance-attributes """ A datatype description @@ -495,18 +219,17 @@ class Datatype(DescriptionBase): .. seealso:: ``_ """ - base: str = None format: Optional[str] = None length: Optional[int] = None - minLength: Optional[int] = None - maxLength: Optional[int] = None + minLength: Optional[int] = None # pylint: disable=C0103 + maxLength: Optional[int] = None # pylint: disable=C0103 minimum: OrderedType = None maximum: OrderedType = None - minInclusive: Optional[bool] = None - maxInclusive: Optional[bool] = None - minExclusive: Optional[bool] = None - maxExclusive: Optional[bool] = None + minInclusive: Optional[bool] = None # pylint: disable=C0103 + maxInclusive: Optional[bool] = None # pylint: disable=C0103 + minExclusive: Optional[bool] = None # pylint: disable=C0103 + maxExclusive: Optional[bool] = None # pylint: disable=C0103 def __post_init__(self): self.base = functools.partial( @@ -515,21 +238,20 @@ def __post_init__(self): 'string', allow_none=True, cond=lambda ss: ss is None or ss in DATATYPES)(self.base) - for att in ('length', 'maxLength', 'minLength'): - setattr(self, att, utils.optional(int)(getattr(self, att))) - for attr_ in [ - 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' - ]: - if getattr(self, attr_) is not None: - setattr(self, attr_, self.parse(getattr(self, attr_))) + self._set_constraints() + self._validate_constraints() - if self.length is not None: - if self.minLength is not None and self.length < self.minLength: - raise ValueError('minLength > length') + def _validate_constraints(self): + def error_if(msg, *conditions): + if any(conditions): + raise ValueError(msg) - if 
self.maxLength is not None: - if self.length > self.maxLength: - raise ValueError('maxLength < length') + if self.length is not None: + error_if( + 'Length limits interfere', + self.minLength is not None and self.length < self.minLength, + self.maxLength is not None and self.length > self.maxLength, + ) if self.minLength is not None and self.maxLength is not None \ and self.minLength > self.maxLength: @@ -541,34 +263,35 @@ def __post_init__(self): if not isinstance( self.basetype(), tuple((DATATYPES[name] for name in ['decimal', 'float', 'datetime', 'duration']))): - if any([getattr(self, at) for at in - 'minimum maximum minExclusive maxExclusive minInclusive maxInclusive'.split()]): - raise ValueError( - 'Applications MUST raise an error if minimum, minInclusive, maximum, ' - 'maxInclusive, minExclusive, or maxExclusive are specified and the base ' - 'datatype is not a numeric, date/time, or duration type.') + error_if( + 'Applications MUST raise an error if minimum, minInclusive, maximum, ' + 'maxInclusive, minExclusive, or maxExclusive are specified and the base ' + 'datatype is not a numeric, date/time, or duration type.', + *[getattr(self, at) for at in + 'minimum maximum minExclusive maxExclusive minInclusive maxInclusive'.split()]) if not isinstance( self.basetype(), (DATATYPES['string'], DATATYPES['base64Binary'], DATATYPES['hexBinary'])): - if self.length or self.minLength or self.maxLength: - raise ValueError( - 'Applications MUST raise an error if length, maxLength, or minLength are ' - 'specified and the base datatype is not string or one of its subtypes, or a ' - 'binary type.') - - if (self.minInclusive and self.minExclusive) or (self.maxInclusive and self.maxExclusive): - raise ValueError( - 'Applications MUST raise an error if both minInclusive and minExclusive are ' - 'specified, or if both maxInclusive and maxExclusive are specified.') - - if (self.minInclusive and self.maxExclusive and self.maxExclusive <= self.minInclusive) or \ - 
(self.minInclusive and self.maxInclusive and self.maxInclusive < self.minInclusive): - raise ValueError('') - - if (self.minExclusive and self.maxExclusive and self.maxExclusive <= self.minExclusive) or ( - self.minExclusive and self.maxInclusive and self.maxInclusive <= self.minExclusive): - raise ValueError('') + error_if( + 'Applications MUST raise an error if length, maxLength, or minLength are ' + 'specified and the base datatype is not string or one of its subtypes, or a ' + 'binary type.', + self.length, self.minLength, self.maxLength) + + error_if( + 'Applications MUST raise an error if both minInclusive and minExclusive are ' + 'specified, or if both maxInclusive and maxExclusive are specified.', + self.minInclusive and self.minExclusive, + self.maxInclusive and self.maxExclusive, + ) + error_if( + 'Limits overlap', + self.minInclusive and self.maxExclusive and self.maxExclusive <= self.minInclusive, + self.minInclusive and self.maxInclusive and self.maxInclusive < self.minInclusive, + self.minExclusive and self.maxExclusive and self.maxExclusive <= self.minExclusive, + self.minExclusive and self.maxInclusive and self.maxInclusive <= self.minExclusive, + ) if 'id' in self.at_props and any( self.at_props['id'] == NAMESPACES['xsd'] + dt for dt in DATATYPES): @@ -580,25 +303,34 @@ def __post_init__(self): self.format = None warnings.warn('Invalid number pattern') + def _set_constraints(self): + for att in ('length', 'maxLength', 'minLength'): + setattr(self, att, utils.optional(int)(getattr(self, att))) + for attr_ in [ + 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' + ]: + if getattr(self, attr_) is not None: + setattr(self, attr_, self.parse(getattr(self, attr_))) + @classmethod - def fromvalue(cls, v: Union[str, dict, 'Datatype']) -> 'Datatype': + def fromvalue(cls, d: Union[str, dict, 'Datatype']) -> 'Datatype': """ :param v: Initialization data for `cls`; either a single string that is the main datatype \ of the 
values of the cell or a datatype description object, i.e. a `dict` or a `cls` \ instance. :return: An instance of `cls` """ - if isinstance(v, str): - return cls(base=v) + if isinstance(d, str): + return cls(base=d) - if isinstance(v, dict): - v.setdefault('base', 'string') - return cls(**cls.partition_properties(v)) + if isinstance(d, dict): + d.setdefault('base', 'string') + return cls(**cls.partition_properties(d)) - if isinstance(v, cls): - return v + if isinstance(d, cls): + return d - raise ValueError(v) + raise ValueError(d) def asdict(self, omit_defaults=True): res = DescriptionBase.asdict(self, omit_defaults=omit_defaults) @@ -612,55 +344,59 @@ def asdict(self, omit_defaults=True): return res @property - def basetype(self): + def basetype(self) -> type: return DATATYPES[self.base] @property - def derived_description(self): + def derived_description(self) -> dict: return self.basetype.derived_description(self) - def formatted(self, v): + def formatted(self, v: Any) -> str: + """Format a value as string.""" return self.basetype.to_string(v, **self.derived_description) - def parse(self, v): + def parse(self, v: str) -> Any: + """Parse a string value into a Python type.""" if v is None: return v return self.basetype.to_python(v, **self.derived_description) - def validate(self, v): + def validate(self, v: T) -> T: + """Make sure the datatype-level constraints are met.""" if v is None: return v try: l_ = len(v or '') if self.length is not None and l_ != self.length: - raise ValueError('value must have length {}'.format(self.length)) + raise ValueError(f'value must have length {self.length}') if self.minLength is not None and l_ < self.minLength: - raise ValueError('value must have at least length {}'.format(self.minLength)) + raise ValueError(f'value must have at least length {self.minLength}') if self.maxLength is not None and l_ > self.maxLength: - raise ValueError('value must have at most length {}'.format(self.maxLength)) + raise ValueError(f'value must have 
at most length {self.maxLength}') except TypeError: pass if self.basetype.minmax: if self.minimum is not None and v < self.minimum: - raise ValueError('value must be >= {}'.format(self.minimum)) + raise ValueError(f'value must be >= {self.minimum}') if self.minInclusive is not None and v < self.minInclusive: - raise ValueError('value must be >= {}'.format(self.minInclusive)) + raise ValueError(f'value must be >= {self.minInclusive}') if self.minExclusive is not None and v <= self.minExclusive: - raise ValueError('value must be > {}'.format(self.minExclusive)) + raise ValueError(f'value must be > {self.minExclusive}') if self.maximum is not None and v > self.maximum: - raise ValueError('value must be <= {}'.format(self.maximum)) + raise ValueError(f'value must be <= {self.maximum}') if self.maxInclusive is not None and v > self.maxInclusive: - raise ValueError('value must be <= {}'.format(self.maxInclusive)) + raise ValueError(f'value must be <= {self.maxInclusive}') if self.maxExclusive is not None and v >= self.maxExclusive: - raise ValueError('value must be < {}'.format(self.maxExclusive)) + raise ValueError(f'value must be < {self.maxExclusive}') return v - def read(self, v): + def read(self, v: str) -> Any: + """Read a value according to the spec of the Datatype.""" return self.validate(self.parse(v)) @dataclasses.dataclass -class Description(DescriptionBase): +class Description(DescriptionBase): # pylint: disable=R0902 """Adds support for inherited properties. .. seealso:: http://w3c.github.io/csvw/metadata/#inherited-properties @@ -672,17 +408,18 @@ class Description(DescriptionBase): # considered equal. 
_parent: Optional[DescriptionBase] = None - aboutUrl: Optional[Union[URITemplate, Invalid]] = None + aboutUrl: Optional[Union[URITemplate, Invalid]] = None # pylint: disable=C0103 datatype: Optional[Datatype] = None default: Optional[Union[str, list[str]]] = "" lang: str = "und" null: list[str] = dataclasses.field(default_factory=lambda: [""]) ordered: Optional[bool] = None - propertyUrl: Optional[Union[URITemplate, Invalid]] = None + propertyUrl: Optional[Union[URITemplate, Invalid]] = None # pylint: disable=C0103 required: Optional[bool] = None separator: Optional[str] = None - textDirection: Optional[Literal["ltr", "rtl", "auto", "inherit"]] = None - valueUrl: Optional[Union[URITemplate, Invalid]] = None + textDirection: Optional[ # pylint: disable=C0103 + Literal["ltr", "rtl", "auto", "inherit"]] = None + valueUrl: Optional[Union[URITemplate, Invalid]] = None # pylint: disable=C0103 def __post_init__(self): if self.datatype is not None: @@ -709,14 +446,22 @@ def __post_init__(self): if getattr(self, att) is not None: setattr(self, att, convert_uri_template(getattr(self, att))) - def inherit(self, attr): + def inherit(self, attr) -> Optional[Any]: + """ + The implementation of the inheritance mechanism. + + The chain of inheritance is established by assigning a an object to `_parent`. If this + object has a method `inherit` as well (i.e. is derived from Description), the chain + may continue. + """ v = getattr(self, attr) if v is None and self._parent: return self._parent.inherit(attr) if hasattr(self._parent, 'inherit') \ else getattr(self._parent, attr) return v - def inherit_null(self): + def inherit_null(self) -> list[str]: + """Inheritance of null is a special case due to the default value not being None.""" if self.null == [""]: if self._parent and hasattr(self._parent, 'inherit_null'): return self._parent.inherit_null() @@ -735,7 +480,7 @@ class Column(Description): .. 
seealso:: ``_ """ name: str = None - suppressOutput: bool = False + suppressOutput: bool = False # pylint: disable=C0103 titles: Optional[NaturalLanguage] = None virtual: bool = False _number: Optional[int] = None @@ -755,14 +500,17 @@ def __post_init__(self): self.virtual = utils.converter(bool, False, self.virtual) def __str__(self): - return self.name or \ - (self.titles and self.titles.getfirst()) or \ - '_col.{}'.format(self._number) + return self.name or (self.titles and self.titles.getfirst()) or f'_col.{self._number}' def __eq__(self, other): return self.asdict() == other.asdict() - def has_title(self, v): + def has_title(self, v) -> Union[str, bool]: + """ + Check whether the name or a title of the column matches v. + + If v matches a title, the associated language tag (or 'und') is returned. + """ if self.name and self.name == v: return True for tag, titles in (self.titles or {}).items(): @@ -771,10 +519,11 @@ def has_title(self, v): return False @property - def header(self): - return '{}'.format(self) + def header(self) -> str: # pylint: disable=missing-function-docstring + return f'{self}' - def read(self, v, strict=True): + def read(self, v: str, strict=True) -> Any: + """Convert a str to a Python object according to the spec for the column.""" required = self.inherit('required') null = self.inherit_null() default = self.inherit('default') @@ -789,18 +538,18 @@ def read(self, v, strict=True): warnings.warn('required column value is missing') raise ValueError('required column value is missing') - if separator: + if separator: # A list-valued column. if not v: - v = [] + v = [] # Empty string is interpreted as empty list. elif v in null: - v = None + v = None # A null value is interpreted as missing data. else: v = (vv or default for vv in v.split(separator)) v = [None if vv in null else vv for vv in v] elif v in null: - v = None + v = None # A null value. - if datatype: + if datatype: # Apply datatype conversion. 
if isinstance(v, list): try: return [datatype.read(vv) for vv in v] @@ -812,7 +561,8 @@ def read(self, v, strict=True): return datatype.read(v) return v - def write(self, v): + def write(self, v: Any) -> str: + """Convert v to a string according to the specifications for the column.""" sep = self.inherit('separator') null = self.inherit_null() datatype = self.inherit('datatype') @@ -831,18 +581,20 @@ def fmt(v): @dataclasses.dataclass class Reference: + """A reference specification as used to describe the targets of foreign keys.""" resource: Optional[Link] = None - schemaReference: Optional[Link] = None - columnReference: Optional[list[str]] = None + schemaReference: Optional[Link] = None # pylint: disable=C0103 + columnReference: Optional[list[str]] = None # pylint: disable=C0103 def __post_init__(self): if self.resource is not None: if self.schemaReference is not None: + # Either a local resource may be referenced or a schema - not both. raise ValueError(self) - self.resource = Link(self.resource) + self.resource = Link.from_value(self.resource) if self.schemaReference is not None: - self.schemaReference = Link(self.schemaReference) + self.schemaReference = Link.from_value(self.schemaReference) if isinstance(self.columnReference, str): self.columnReference = [self.columnReference] @@ -850,7 +602,8 @@ def __post_init__(self): @dataclasses.dataclass class ForeignKey: - columnReference: Optional[list[str]] = None + """A specification of a foreign key.""" + columnReference: Optional[list[str]] = None # pylint: disable=C0103 reference: Optional[Reference] = None def __post_init__(self): @@ -859,19 +612,20 @@ def __post_init__(self): @classmethod def fromdict(cls, d): + """Instantiate an object from a dict as returned by parsing the JSON metadata.""" if isinstance(d, dict): try: _ = Reference(**d['reference']) - except TypeError: - raise ValueError('Invalid reference property') + except TypeError as e: + raise ValueError('Invalid reference property') from e if not 
set(d.keys()).issubset({'columnReference', 'reference'}): raise ValueError('Invalid foreignKey spec') kw = dict(d, reference=Reference(**d['reference'])) return cls(**kw) - def asdict(self, **kw): - res = utils.attr_asdict(self, **kw) - res['reference'] = utils.attr_asdict(res['reference'], **kw) + def asdict(self, **kw) -> dict[str, Any]: # pylint: disable=C0116 + res = dataclass_asdict(self, **kw) + res['reference'] = dataclass_asdict(res['reference'], **kw) return res @@ -887,9 +641,9 @@ class Schema(Description): .. seealso:: ``_ """ columns: list[Column] = dataclasses.field(default_factory=list) - foreignKeys: list[ForeignKey] = dataclasses.field(default_factory=list) - primaryKey: Optional[list[str]] = None - rowTitles: list[str] = dataclasses.field(default_factory=list) + foreignKeys: list[ForeignKey] = dataclasses.field(default_factory=list) # pylint: disable=C0103 + primaryKey: Optional[list[str]] = None # pylint: disable=C0103 + rowTitles: list[str] = dataclasses.field(default_factory=list) # pylint: disable=C0103 def __post_init__(self): super().__post_init__() @@ -897,7 +651,7 @@ def __post_init__(self): Column.fromvalue(c) for c in utils.converter(dict, None, utils.converter(list, [], self.columns))] for i, col in enumerate(self.columns): - col._number = i + 1 + col._number = i + 1 # pylint: disable=protected-access if self.foreignKeys is None: self.foreignKeys = [] else: @@ -915,48 +669,55 @@ def __post_init__(self): virtual, seen, names = False, set(), set() for i, col in enumerate(self.columns): - if col.name and (col.name.startswith('_') or re.search(r'\s', col.name)): - warnings.warn('Invalid column name') - if col.virtual: # first virtual column sets the flag - virtual = True - elif virtual: # non-virtual column after virtual column! 
- raise ValueError('no non-virtual column allowed after virtual columns') - if not virtual: - if col.header in seen: - warnings.warn('Duplicate column name!') - if col.name: - if col.name in names: - raise ValueError('Duplicate column name {}'.format(col.name)) - names.add(col.name) - seen.add(col.header) - col._parent = self + virtual = self._check_col(col, virtual, names, seen) + col._parent = self # pylint: disable=protected-access for colref in self.primaryKey or []: col = self.columndict.get(colref) if col and not col.name: warnings.warn('A primaryKey referenced column MUST have a `name` property') self.primaryKey = None + def _check_col(self, col, virtual: bool, names: set[str], seen: set[str]) -> bool: + if col.name and (col.name.startswith('_') or re.search(r'\s', col.name)): + warnings.warn('Invalid column name') + if col.virtual: # first virtual column sets the flag + virtual = True + elif virtual: # non-virtual column after virtual column! + raise ValueError('no non-virtual column allowed after virtual columns') + if not virtual: + if col.header in seen: + warnings.warn('Duplicate column name!') + if col.name: + if col.name in names: + raise ValueError(f'Duplicate column name {col.name}') + names.add(col.name) + seen.add(col.header) + return virtual + @classmethod - def fromvalue(cls, v): - if isinstance(v, str): + def fromvalue(cls, d: Union[dict, str]) -> 'Schema': + """Instantiate a Schema from a dict or a URL to a JSON file.""" + if isinstance(d, str): try: # The schema is referenced with a URL - v = requests.get(v).json() - except: # pragma: no cover # noqa: E722 - return v - if not isinstance(v, dict): - if isinstance(v, int): + d = requests.get(d, timeout=10).json() + except: # pragma: no cover # noqa: E722 # pylint: disable=W0702 + return d + if not isinstance(d, dict): + if isinstance(d, int): warnings.warn('Invalid value for tableSchema property') - v = {} - return cls(**cls.partition_properties(v)) + d = {} + return 
cls(**cls.partition_properties(d)) @property - def columndict(self): + def columndict(self) -> dict[str, Column]: + """A table's columns mapped by header, i.e. normalized name.""" return {c.header: c for c in self.columns} - def get_column(self, name, strict=False): + def get_column(self, name: str, strict: bool = False) -> Optional[Column]: + """Resolve a Column by name, titles or propertyUrl.""" col = self.columndict.get(name) - assert (not strict) or (col and col.name) + assert (not strict) or (col and col.name), name if not col: for c in self.columns: if c.titles and c.titles.getfirst() == name: @@ -966,18 +727,6 @@ def get_column(self, name, strict=False): return col -def dialect_props(d): - if not isinstance(d, dict): - warnings.warn('Invalid dialect spec') - return {} - partitioned = Description.partition_properties(d, type_name='Dialect', strict=False) - del partitioned['at_props'] - del partitioned['common_props'] - if partitioned.get('headerRowCount'): - partitioned['header'] = True - return partitioned - - @dataclasses.dataclass class TableLike(Description): """ @@ -1007,8 +756,8 @@ class TableLike(Description): """ dialect: Optional[Union[str, Dialect]] = None notes: list[str] = dataclasses.field(default_factory=list) - tableDirection: Literal['rtl', 'ltr', 'auto'] = 'auto' - tableSchema: Optional[Schema] = None + tableDirection: Literal['rtl', 'ltr', 'auto'] = 'auto' # pylint: disable=invalid-name + tableSchema: Optional[Schema] = None # pylint: disable=invalid-name transformations: list = dataclasses.field(default_factory=list) url: Optional[Link] = None _fname: Union[str, pathlib.Path] = None # The path of the metadata file. 
@@ -1016,7 +765,8 @@ class TableLike(Description): def __post_init__(self): super().__post_init__() if isinstance(self.dialect, str): - self.dialect = Dialect(**dialect_props(get_json(Link(self.dialect).resolve(self.base)))) + self.dialect = Dialect( + **dialect_props(utils.get_json(Link(self.dialect).resolve(self.base)))) elif self.dialect is not None: self.dialect = Dialect(**dialect_props(self.dialect)) @@ -1027,31 +777,17 @@ def __post_init__(self): if not isinstance(self.transformations, list): warnings.warn('Invalid transformations property') for tr in self.transformations: - Description.partition_properties(tr, type_name='Template') + DescriptionBase.partition_properties(tr, type_name='Template') if self.url is not None: self.url = Link(self.url) - if self.tableSchema and not (isinstance(self.tableSchema, str)): - self.tableSchema._parent = self + if self.tableSchema and not isinstance(self.tableSchema, str): + self.tableSchema._parent = self # pylint: disable=protected-access if 'id' in self.at_props and self.at_props['id'] is None: self.at_props['id'] = self.base - ctx = self.at_props.get('context') - if isinstance(ctx, list): - for obj in ctx: - if (isinstance(obj, dict) and not set(obj.keys()).issubset({'@base', '@language'}))\ - or (isinstance(obj, str) and obj != 'http://www.w3.org/ns/csvw'): - raise ValueError( - 'The @context MUST have one of the following values: An array composed ' - 'of a string followed by an object, where the string is ' - 'http://www.w3.org/ns/csvw and the object represents a local context ' - 'definition, which is restricted to contain either or both of' - '@base and @language.') - if isinstance(obj, dict) and '@language' in obj: - if not tags.check(obj['@language']): - warnings.warn('Invalid value for @language property') - del obj['@language'] - - def get_column(self, spec): + valid_context_property(self.at_props.get('context')) + + def get_column(self, spec: str) -> Optional[Column]: # pylint: disable=C0116 return 
self.tableSchema.get_column(spec) if self.tableSchema else None @classmethod @@ -1061,7 +797,7 @@ def from_file(cls, fname: Union[str, pathlib.Path], data=None) -> 'TableLike': """ if is_url(str(fname)): return cls.from_url(str(fname), data=data) - res = cls.fromvalue(data or get_json(fname)) + res = cls.fromvalue(data or utils.get_json(fname)) res._fname = pathlib.Path(fname) return res @@ -1070,7 +806,7 @@ def from_url(cls, url: str, data=None) -> 'TableLike': """ Instantiate a CSVW Table or TableGroup description from a metadata file specified by URL. """ - data = data or get_json(url) + data = data or utils.get_json(url) url = urlparse(url) data.setdefault('@base', urlunparse((url.scheme, url.netloc, url.path, '', '', ''))) for table in data.get('tables', [data]): @@ -1089,7 +825,7 @@ def to_file(self, fname: Union[str, pathlib.Path], omit_defaults=True) -> pathli """ fname = utils.ensure_path(fname) data = self.asdict(omit_defaults=omit_defaults) - with json_open(str(fname), 'w') as f: + with utils.json_open(str(fname), 'w') as f: json.dump(data, f, indent=4, separators=(',', ': ')) return fname @@ -1110,8 +846,9 @@ def base(self) -> Union[str, pathlib.Path]: # **base URL** for other URLs in the metadata document. 
return Link(ctxbase).resolve(at_props['base']) return at_props['base'] - return self._parent._fname.parent if (self._parent and self._parent._fname) else \ - (self._fname.parent if self._fname else None) + if self._parent and self._parent._fname: # pylint: disable=protected-access + return self._parent._fname.parent # pylint: disable=protected-access + return self._fname.parent if self._fname else None # pylint: disable=protected-access def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> str: """ @@ -1135,7 +872,7 @@ def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> if tmpl.uri.startswith(prefix + ':'): # If the URI Template is a QName, we expand it to a URL to prevent `Link.resolve` # from turning it into a local path. - res = '{}{}'.format(url, tmpl.uri.split(':')[1]) + res = f"{url}{tmpl.uri.split(':')[1]}" break else: res = Link( @@ -1153,6 +890,50 @@ def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> return res +@dataclasses.dataclass(frozen=True) +class CsvRow: + """A bag of attributes specifying a row in a CSV file.""" + fname: str + lineno: int + row: list[str] + + +@dataclasses.dataclass +class RowParseSpec: + """A bag of attributes used when parsing a CSV row.""" + strict: bool + log: Optional[logging.Logger] + row_implementation: type = collections.OrderedDict + error: bool = False + + def log_error(self, msg: str): + """Log and record error.""" + utils.log_or_raise(msg, log=self.log) + self.error = True + + +@dataclasses.dataclass +class TableParseSpec: + """Some metadata, categorizing columns in a table.""" + colnames: list[str] = dataclasses.field(default_factory=list) + virtualcols: list[tuple[str, URITemplate]] = dataclasses.field(default_factory=list) + requiredcols: set[str] = dataclasses.field(default_factory=set) + + @classmethod + def from_columns(cls, columns: Iterable[Column]) -> 'TableParseSpec': + """Initialize from columns (e.g. 
columns property of Schema).""" + res = cls() + for col in columns: + if col.virtual: + if col.valueUrl: + res.virtualcols.append((col.header, col.valueUrl)) + else: + res.colnames.append(col.header) + if col.required: + res.requiredcols.add(col.header) + return res + + @dataclasses.dataclass class Table(TableLike): """ @@ -1168,7 +949,7 @@ class Table(TableLike): .. seealso:: ``_ """ - suppressOutput: bool = False + suppressOutput: bool = False # pylint: disable=invalid-name _comments = [] def add_foreign_key(self, colref, ref_resource, ref_colref): @@ -1181,7 +962,7 @@ def add_foreign_key(self, colref, ref_resource, ref_colref): """ colref = [colref] if not isinstance(colref, (tuple, list)) else colref if not all(col in [c.name for c in self.tableSchema.columns] for col in colref): - raise ValueError('unknown column in foreignKey {0}'.format(colref)) + raise ValueError(f'unknown column in foreignKey {colref}') self.tableSchema.foreignKeys.append(ForeignKey.fromdict({ 'columnReference': colref, @@ -1195,6 +976,7 @@ def __post_init__(self): @property def local_name(self) -> Union[str, None]: + """The filename of a table.""" return self.url.string if self.url else None def _get_dialect(self) -> Dialect: @@ -1231,13 +1013,12 @@ def write(self, row = [col.write(item[i]) for i, col in enumerate(non_virtual_cols)] else: if strict: - add = set(item.keys()) - {'{}'.format(col) for col in non_virtual_cols} + add = set(item.keys()) - {f'{col}' for col in non_virtual_cols} if add: - raise ValueError("dict contains fields not in fieldnames: {}".format( - ', '.join("'{}'".format(field) for field in add))) + add = ', '.join(f"'{field}'" for field in add) + raise ValueError(f"dict contains fields not in fieldnames: {add}") row = [ - col.write(item.get( - col.header, item.get('{}'.format(col)))) + col.write(item.get(col.header, item.get(f'{col}'))) for col in non_virtual_cols] rowcount += 1 writer.writerow(row) @@ -1255,6 +1036,7 @@ def write(self, return rowcount def 
check_primary_key(self, log=None, items=None) -> bool: + """Make sure primary keys are unique.""" success = True if items is not None: warnings.warn('the items argument of check_primary_key ' @@ -1263,12 +1045,10 @@ def check_primary_key(self, log=None, items=None) -> bool: get_pk = operator.itemgetter(*self.tableSchema.primaryKey) seen = set() # Read all rows in the table, ignoring errors: - for fname, lineno, row in self.iterdicts(log=nolog(), with_metadata=True): + for fname, lineno, row in self.iterdicts(log=utils.nolog(), with_metadata=True): pk = get_pk(row) if pk in seen: - log_or_raise( - '{0}:{1} duplicate primary key: {2}'.format(fname, lineno, pk), - log=log) + utils.log_or_raise(f'{fname}:{lineno} duplicate primary key: {pk}', log=log) success = False else: seen.add(pk) @@ -1277,14 +1057,123 @@ def check_primary_key(self, log=None, items=None) -> bool: def __iter__(self): return self.iterdicts() - def iterdicts( + def _get_csv_reader(self, fname, dialect, stack) -> UnicodeReaderWithLineNumber: + if is_url(fname): + handle = io.TextIOWrapper( + io.BytesIO(requests.get(str(fname), timeout=10).content), encoding=dialect.encoding) + else: + handle = fname + fpath = pathlib.Path(fname) + if not fpath.exists(): + zipfname = fpath.parent.joinpath(fpath.name + '.zip') + if zipfname.exists(): + zipf = stack.enter_context(zipfile.ZipFile(zipfname)) # pylint: disable=R1732 + handle = io.TextIOWrapper( + zipf.open([n for n in zipf.namelist() if n.endswith(fpath.name)][0]), + encoding=dialect.encoding) + + return stack.enter_context(UnicodeReaderWithLineNumber(handle, dialect=dialect)) + + def _validated_csv_header(self, header, strict) -> list[str]: + if not strict: + if self.tableSchema.columns and len(self.tableSchema.columns) < len(header): + warnings.warn('Column number mismatch') + for name, col in zip(header, self.tableSchema.columns): + res = col.has_title(name) + if (not col.name) and not res: + warnings.warn('Incompatible table models') + if 
(isinstance(res, str) and # noqa: W504 + res.split('-')[0] not in ['und', (self.lang or 'und').split('-')[0]]): + warnings.warn('Incompatible column titles') + return header + + def _read_row( + self, + row: CsvRow, + parse_spec: RowParseSpec, + header_cols: list[tuple[int, str, Column]], + spec: TableParseSpec, + ) -> RowType: + required = {h: j for j, h, c in header_cols if c and c.required} + res = parse_spec.row_implementation() + + for (j, k, col), v in zip(header_cols, row.row): + # see http://w3c.github.io/csvw/syntax/#parsing-cells + if col: + try: + res[col.header] = col.read(v, strict=parse_spec.strict) + except ValueError as e: + if not parse_spec.strict: + warnings.warn(f'Invalid column value: {v} {col.datatype}; {e}') + res[col.header] = v + else: + parse_spec.log_error(f'{row.fname}:{row.lineno}:{j + 1} {k}: {e}') + if k in required: + del required[k] + else: + if parse_spec.strict: + warnings.warn(f'Unspecified column "{k}" in table {self.local_name}') + res[k] = v + + for k, j in required.items(): + if k not in res: + parse_spec.log_error( + f'{row.fname}:{row.lineno}:{j + 1} {k}: required column value is missing') + + # Augment result with regular columns not provided in the data: + for key in spec.colnames: + res.setdefault(key, None) + + # Augment result with virtual columns: + for key, value_url in spec.virtualcols: + res[key] = value_url.expand(**res) + return res + + def _get_header_cols( + self, + header: list[str], + colnames: list[str], + strict: bool, + row: Iterable, + ) -> list[tuple[int, str, Column]]: + def default_col(index): + return Column.fromvalue({'name': f'_col.{index}'}) + + # If columns in the data are ordered as in the spec, we can match values to + # columns by index, rather than looking up columns by name. 
+ if (header == colnames) or \ + (len(self.tableSchema.columns) >= len(header) and not strict): + # Note that virtual columns are only allowed to come **after** regular ones, + # so we can simply zip the whole columns list, and silently ignore surplus + # virtual columns. + header_cols = list(zip(header, self.tableSchema.columns)) + elif not strict and self.tableSchema.columns and \ + (len(self.tableSchema.columns) < len(header)): + header_cols = [] + for i, cname in enumerate(header): + try: + header_cols.append((cname, self.tableSchema.columns[i])) + except IndexError: + col = default_col(i + 1) + header_cols.append((col.name, col)) + else: + header_cols = [(h, self.tableSchema.get_column(h)) for h in header] + + if not header_cols: + for i, _ in enumerate(row): + col = default_col(i + 1) + header_cols.append((col.name, col)) + + return [(j, h, c) for j, (h, c) in enumerate(header_cols)] + + def iterdicts( # pylint: disable=too-many-locals self, - log=None, - with_metadata=False, + log: Optional[logging.Logger] = None, + with_metadata: bool = False, fname=None, - _Row=collections.OrderedDict, + _Row: type = collections.OrderedDict, # pylint: disable=invalid-name strict=True, - ) -> Generator[dict, None, None]: + ) -> Generator[Union[dict[str, Any], tuple[str, int, dict[str, Any]]], None, None]: """Iterate over the rows of the table Create an iterator that maps the information in each row to a `dict` whose keys are @@ -1307,136 +1196,70 @@ def iterdicts( """ dialect = self._get_dialect() fname = fname or self.url.resolve(self.base) - colnames, virtualcols, requiredcols = [], [], set() - for col in self.tableSchema.columns: - if col.virtual: - if col.valueUrl: - virtualcols.append((col.header, col.valueUrl)) - else: - colnames.append(col.header) - if col.required: - requiredcols.add(col.header) + + table_parse_spec = TableParseSpec.from_columns(self.tableSchema.columns) with contextlib.ExitStack() as stack: - if is_url(fname): - handle = io.TextIOWrapper( - 
io.BytesIO(requests.get(str(fname)).content), encoding=dialect.encoding) - else: - handle = fname - fpath = pathlib.Path(fname) - if not fpath.exists(): - zipfname = fpath.parent.joinpath(fpath.name + '.zip') - if zipfname.exists(): - zipf = stack.enter_context(zipfile.ZipFile(str(zipfname))) - handle = io.TextIOWrapper( - zipf.open([n for n in zipf.namelist() if n.endswith(fpath.name)][0]), - encoding=dialect.encoding) - - reader = stack.enter_context(UnicodeReaderWithLineNumber(handle, dialect=dialect)) - reader = iter(reader) + reader = iter(self._get_csv_reader(fname, dialect, stack)) # If the data file has a header row, this row overrides the header as # specified in the metadata. if dialect.header: try: - _, header = next(reader) - if not strict: - if self.tableSchema.columns and len(self.tableSchema.columns) < len(header): - warnings.warn('Column number mismatch') - for name, col in zip(header, self.tableSchema.columns): - res = col.has_title(name) - if (not col.name) and not res: - warnings.warn('Incompatible table models') - if isinstance(res, str) and res.split('-')[0] not in [ - 'und', (self.lang or 'und').split('-')[0]]: - warnings.warn('Incompatible column titles') + header = self._validated_csv_header(next(reader)[1], strict) except StopIteration: # pragma: no cover return else: - header = colnames - - # If columns in the data are ordered as in the spec, we can match values to - # columns by index, rather than looking up columns by name. - if (header == colnames) or \ - (len(self.tableSchema.columns) >= len(header) and not strict): - # Note that virtual columns are only allowed to come **after** regular ones, - # so we can simply zip the whole columns list, and silently ignore surplus - # virtual columns. 
- header_cols = list(zip(header, self.tableSchema.columns)) - elif not strict and self.tableSchema.columns and \ - (len(self.tableSchema.columns) < len(header)): - header_cols = [] - for i, cname in enumerate(header): - try: - header_cols.append((cname, self.tableSchema.columns[i])) - except IndexError: - header_cols.append(( - '_col.{}'.format(i + 1), - Column.fromvalue({'name': '_col.{}'.format(i + 1)}))) - else: - header_cols = [(h, self.tableSchema.get_column(h)) for h in header] - header_cols = [(j, h, c) for j, (h, c) in enumerate(header_cols)] - missing = requiredcols - set(c.header for j, h, c in header_cols if c) - if missing: - raise ValueError('{0} is missing required columns {1}'.format(fname, missing)) - - for lineno, row in reader: - required = {h: j for j, h, c in header_cols if c and c.required} - res = _Row() - error = False - if (not header_cols) and row: - header_cols = [ - (i, - '_col.{}'.format(i + 1), - Column.fromvalue({'name': '_col.{}'.format(i + 1)})) - for i, _ in enumerate(row)] - for (j, k, col), v in zip(header_cols, row): - # see http://w3c.github.io/csvw/syntax/#parsing-cells - if col: - try: - res[col.header] = col.read(v, strict=strict) - except ValueError as e: - if not strict: - warnings.warn( - 'Invalid column value: {} {}; {}'.format(v, col.datatype, e)) - res[col.header] = v - else: - log_or_raise( - '{0}:{1}:{2} {3}: {4}'.format(fname, lineno, j + 1, k, e), - log=log) - error = True - if k in required: - del required[k] - else: - if strict: - warnings.warn( - 'Unspecified column "{0}" in table {1}'.format(k, self.local_name)) - res[k] = v - - for k, j in required.items(): - if k not in res: - log_or_raise( - '{0}:{1}:{2} {3}: {4}'.format( - fname, lineno, j + 1, k, 'required column value is missing'), - log=log) - error = True - - # Augment result with regular columns not provided in the data: - for key in colnames: - res.setdefault(key, None) - - # Augment result with virtual columns: - for key, valueUrl in virtualcols: - 
res[key] = valueUrl.expand(**res) - - if not error: - if with_metadata: - yield fname, lineno, res - else: - yield res + header = table_parse_spec.colnames + + header_cols = None + for i, (lineno, row) in enumerate(reader): + if i == 0: + header_cols = self._get_header_cols( + header, table_parse_spec.colnames, strict, row) + missing = table_parse_spec.requiredcols - \ + {c.header for j, h, c in header_cols if c} + if missing: + raise ValueError(f'{fname} is missing required columns {missing}') + + parse_spec = RowParseSpec(strict=strict, log=log, row_implementation=_Row) + res = self._read_row( + CsvRow(fname=fname, lineno=lineno, row=row), + parse_spec, + header_cols, + table_parse_spec, + ) + if not parse_spec.error: + yield (fname, lineno, res) if with_metadata else res self._comments = reader.comments +@dataclasses.dataclass(frozen=True) +class ForeignKeyInstance: + """Simple structure holding the specification of a foreign key.""" + target_table: Table + pk: ColRefType + source_table: Table + fk: ColRefType + + def validate(self, strict: bool) -> None: + """Checks whether the colrefs for fk and pk match.""" + if len(self.fk) != len(self.pk): + raise ValueError( + 'Foreign key error: non-matching number of columns in source and target') + for scol, tcol in zip(self.fk, self.pk): + scolumn = self.source_table.tableSchema.get_column(scol, strict=strict) + tcolumn = self.target_table.tableSchema.get_column(tcol, strict=strict) + if not (scolumn and tcolumn): + raise ValueError( + f'Foreign key error: missing column "{scol}" or "{tcol}"') + if scolumn.datatype and tcolumn.datatype and \ + scolumn.datatype.base != tcolumn.datatype.base: + raise ValueError( + f'Foregin key error: non-matching datatype "{scol}:{scolumn.datatype.base}" ' + f'or "{tcol}:{tcolumn.datatype.base}"') + + @dataclasses.dataclass class TableGroup(TableLike): """ @@ -1464,10 +1287,11 @@ def __post_init__(self): self.tables = res super().__post_init__() for table in self.tables: - table._parent 
= self + table._parent = self # pylint: disable=protected-access @classmethod def from_frictionless_datapackage(cls, dp): + """Initialize a TableGroup from a frictionless DataPackage.""" return DataPackage(dp).to_tablegroup(cls) def read(self): @@ -1509,24 +1333,30 @@ def copy(self, dest: Union[pathlib.Path, str]): @property def tabledict(self) -> dict[str, Table]: + """Convenient access to tables by name.""" return {t.local_name: t for t in self.tables} - def foreign_keys(self) -> list[tuple[Table, ColRefType, Table, ColRefType]]: - return [ - ( - self.tabledict[fk.reference.resource.string], - tuple(fk.reference.columnReference), - t, - tuple(fk.columnReference)) - for t in self.tables for fk in t.tableSchema.foreignKeys - if not fk.reference.schemaReference] - - def validate_schema(self, strict=False): + def validate_schema(self, strict: bool = False) -> list[ForeignKeyInstance]: + """Check whether pk and fk specs in foreign key constraints match.""" try: - for fki in [ForeignKeyInstance(*fk) for fk in self.foreign_keys()]: + fkis = sorted( + [ + ForeignKeyInstance( + self.tabledict[fk.reference.resource.string], + tuple(fk.reference.columnReference), + t, + tuple(fk.columnReference)) + for t in self.tables for fk in t.tableSchema.foreignKeys + if not fk.reference.schemaReference], + key=lambda x: (x.target_table.local_name, x.pk, x.source_table.local_name)) + except KeyError as e: + raise ValueError(f'Foreign key error: missing table "{e}" referenced') from e + try: + for fki in fkis: fki.validate(strict=strict) - except (KeyError, AssertionError) as e: - raise ValueError('Foreign key error: missing table "{}" referenced'.format(e)) + except AssertionError as e: + raise ValueError(f'Foreign key error: missing column "{e}" referenced') from e + return fkis def check_referential_integrity(self, data=None, log=None, strict=False): """ @@ -1540,17 +1370,19 @@ def check_referential_integrity(self, data=None, log=None, strict=False): for fk in 
t.tableSchema.foreignKeys: for row in t: if any(row.get(col) is None for col in fk.columnReference): - raise ValueError('Foreign key column is null: {} {}'.format( - [row.get(col) for col in fk.columnReference], fk.columnReference)) + raise ValueError( + f'Foreign key column is null: ' + f'{[row.get(col) for col in fk.columnReference]} ' + f'{fk.columnReference}') try: - self.validate_schema() + fkis = self.validate_schema() success = True except ValueError as e: + fkis = [] success = False - log_or_raise(str(e), log=log, level='error') + utils.log_or_raise(str(e), log=log, level='error') - fkeys = [ForeignKeyInstance(*fk) for fk in self.foreign_keys()] - # FIXME: We only support Foreign Key references between tables! + # FIXME: We only support Foreign Key references between tables! pylint: disable=W0511 # We group foreign key constraints by target table, because we only want to read the # available primary keys once and then check all tables referencing the target table in # a loop. @@ -1558,10 +1390,7 @@ def check_referential_integrity(self, data=None, log=None, strict=False): # Grouping by local_name of tables - even though we'd like to have the table objects # around, too. This it to prevent going down the rabbit hole of comparing table objects # for equality, when comparison of the string names is enough. 
- fkeys = sorted( - fkeys, - key=lambda x: (x.target_table.local_name, x.pk, x.source_table.local_name)) - for _, grp in itertools.groupby(fkeys, lambda x: x.target_table.local_name): + for _, grp in itertools.groupby(fkis, lambda x: x.target_table.local_name): grp = list(grp) target_table = grp[0].target_table fks = collections.OrderedDict() @@ -1618,28 +1447,29 @@ def get_pks(self, success: bool, strict: bool) -> bool: self.refs[pk].add(vals) return success - def _check_item(self, success: bool, ref: 'RefValues', pk: ColRefType) -> bool: + def _check_item(self, success: bool, vals: 'RefValues', pk: ColRefType) -> bool: """ We check if the value for the foreign key are available in the referenced table. """ pks = self.refs[pk] single_column = self.pks[pk] - if ref.values is None: # null-valued foreign key. + if vals.values is None: # null-valued foreign key. return success - if single_column and isinstance(ref.values, list): + if single_column and isinstance(vals.values, list): # We allow list-valued columns as foreign key columns in case it's not a composite key. # If a foreign key is list-valued, we check for a matching row for each of the values # in the list. - refs = ref.values + refs = vals.values else: - refs = [ref.values] + refs = [vals.values] for ref in refs: if not single_column and None in ref: # pragma: no cover # A composite key and one component of the fk is null? - # TODO: raise if any(c is not None for c in values)? + # TODO: raise if any(c is not None for c in values)? 
pylint: disable=W0511 continue if ref not in pks: - log_or_raise(f'{ref} not found in table {self.table.url.string}', log=self.log) + utils.log_or_raise( + f'{vals} not found in table {self.table.url.string}', log=self.log) success = False return success @@ -1660,31 +1490,6 @@ def check_fks( return success -@dataclasses.dataclass(frozen=True) -class ForeignKeyInstance: - """Simple structure holding the specification of a foreign key.""" - target_table: Table - pk: tuple[str] - source_table: Table - fk: tuple[str] - - def validate(self, strict: bool): - if len(self.fk) != len(self.pk): - raise ValueError( - 'Foreign key error: non-matching number of columns in source and target') - for scol, tcol in zip(self.fk, self.pk): - scolumn = self.source_table.tableSchema.get_column(scol, strict=strict) - tcolumn = self.target_table.tableSchema.get_column(tcol, strict=strict) - if not (scolumn and tcolumn): - raise ValueError( - f'Foreign key error: missing column "{scol}" or "{tcol}"') - if scolumn.datatype and tcolumn.datatype and \ - scolumn.datatype.base != tcolumn.datatype.base: - raise ValueError( - 'Foregin key error: non-matching datatype "{}:{}" or "{}:{}"'.format( - scol, scolumn.datatype.base, tcol, tcolumn.datatype.base)) - - @dataclasses.dataclass(frozen=True) class RefValues: """Bundle properties of a table row for simpler checking.""" @@ -1709,7 +1514,7 @@ def __init__(self, url: str, md_url: Optional[str] = None, validate: bool = Fals no_header = False try: - md = get_json(md_url or url) + md = utils.get_json(md_url or url) # The URL could be read as JSON document, thus, the user supplied us with overriding # metadata as per https://w3c.github.io/csvw/syntax/#overriding-metadata except json.decoder.JSONDecodeError: @@ -1787,7 +1592,8 @@ def locate_metadata(url=None) -> tuple[dict, bool]: """ def describes(md, url): for table in md.get('tables', [md]): - # FIXME: We check whether the metadata describes a CSV file just superficially, + # FIXME: pylint: 
disable=W0511 + # We check whether the metadata describes a CSV file just superficially, # by comparing the last path components of the respective URLs. if url.split('/')[-1] == table['url'].split('/')[-1]: return True @@ -1802,7 +1608,7 @@ def describes(md, url): desc = res.links.get('describedby') if desc and desc['type'] in [ "application/csvm+json", "application/ld+json", "application/json"]: - md = get_json(Link(desc['url']).resolve(url)) + md = utils.get_json(Link(desc['url']).resolve(url)) if describes(md, url): return md, no_header warnings.warn('Ignoring linked metadata because it does not reference the data') @@ -1829,7 +1635,7 @@ def describes(md, url): elif url: # Default Locations for local files: if pathlib.Path(str(url) + '-metadata.json').exists(): - return get_json(pathlib.Path(str(url) + '-metadata.json')), no_header + return utils.get_json(pathlib.Path(str(url) + '-metadata.json')), no_header res = { '@context': "http://www.w3.org/ns/csvw", 'url': url, @@ -1861,7 +1667,7 @@ def to_json(self, minimal=False): def _table_to_json(self, table): res = collections.OrderedDict() - # FIXME: id + # FIXME: id pylint: disable=W0511 res['url'] = str(table.url.resolve(table.base)) if 'id' in table.at_props: res['@id'] = table.at_props['id'] diff --git a/src/csvw/metadata_utils.py b/src/csvw/metadata_utils.py new file mode 100644 index 0000000..bb9534a --- /dev/null +++ b/src/csvw/metadata_utils.py @@ -0,0 +1,335 @@ +""" +Helpers to model CSVW metadata as dataclasses. 
+""" +import decimal +import warnings +import collections +from collections.abc import Generator +import dataclasses +from typing import Any, Optional, Union + +from language_tags import tags + +from .utils import is_url + +__all__ = ['valid_common_property', 'valid_id_property', 'valid_context_property', + 'DescriptionBase', 'dataclass_asdict', 'NAMESPACES', 'dialect_props'] + +NumberType = Union[int, float, decimal.Decimal] +NAMESPACES = { + 'csvw': 'http://www.w3.org/ns/csvw#', + 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', + 'xsd': 'http://www.w3.org/2001/XMLSchema#', + 'dc': 'http://purl.org/dc/terms/', + 'dcat': 'http://www.w3.org/ns/dcat#', + 'prov': 'http://www.w3.org/ns/prov#', + 'schema': 'http://schema.org/', + "as": "https://www.w3.org/ns/activitystreams#", + "cc": "http://creativecommons.org/ns#", + "ctag": "http://commontag.org/ns#", + "dc11": "http://purl.org/dc/elements/1.1/", + "dctypes": "http://purl.org/dc/dcmitype/", + "dqv": "http://www.w3.org/ns/dqv#", + "duv": "https://www.w3.org/ns/duv#", + "foaf": "http://xmlns.com/foaf/0.1/", + "gr": "http://purl.org/goodrelations/v1#", + "grddl": "http://www.w3.org/2003/g/data-view#", + "ical": "http://www.w3.org/2002/12/cal/icaltzd#", + "jsonld": "http://www.w3.org/ns/json-ld#", + "ldp": "http://www.w3.org/ns/ldp#", + "ma": "http://www.w3.org/ns/ma-ont#", + "oa": "http://www.w3.org/ns/oa#", + "odrl": "http://www.w3.org/ns/odrl/2/", + "og": "http://ogp.me/ns#", + "org": "http://www.w3.org/ns/org#", + "owl": "http://www.w3.org/2002/07/owl#", + "qb": "http://purl.org/linked-data/cube#", + "rdfa": "http://www.w3.org/ns/rdfa#", + "rev": "http://purl.org/stuff/rev#", + "rif": "http://www.w3.org/2007/rif#", + "rr": "http://www.w3.org/ns/r2rml#", + "sd": "http://www.w3.org/ns/sparql-service-description#", + "sioc": "http://rdfs.org/sioc/ns#", + "skos": "http://www.w3.org/2004/02/skos/core#", + "skosxl": "http://www.w3.org/2008/05/skos-xl#", + "sosa": 
"http://www.w3.org/ns/sosa/", + "ssn": "http://www.w3.org/ns/ssn/", + "time": "http://www.w3.org/2006/time#", + "v": "http://rdf.data-vocabulary.org/#", + "vcard": "http://www.w3.org/2006/vcard/ns#", + "void": "http://rdfs.org/ns/void#", + "wdr": "http://www.w3.org/2007/05/powder#", + "wrds": "http://www.w3.org/2007/05/powder-s#", + "xhv": "http://www.w3.org/1999/xhtml/vocab#", + "xml": "http://www.w3.org/XML/1998/namespace", +} +CSVW_TERMS = """Cell +Column +Datatype +Dialect +Direction +ForeignKey +JSON +NumericFormat +Row +Schema +Table +TableGroup +TableReference +Transformation +aboutUrl +base +columnReference +columns +commentPrefix +datatype +decimalChar +default +delimiter +describes +dialect +doubleQuote +encoding +foreignKeys +format +groupChar +header +headerRowCount +json +lang +length +lineTerminators +maxExclusive +maxInclusive +maxLength +maximum +minExclusive +minInclusive +minLength +minimum +name +notes +null +ordered +pattern +primaryKey +propertyUrl +quoteChar +reference +referencedRows +required +resource +row +rowTitles +rownum +schemaReference +scriptFormat +separator +skipBlankRows +skipColumns +skipInitialSpace +skipRows +source +suppressOutput +tableDirection +tableSchema +tables +targetFormat +textDirection +titles +transformations +trim +uriTemplate +url +valueUrl +virtual""".split() + + +def dataclass_asdict(obj, omit_defaults: bool = True, omit_private: bool = True) -> dict[str, Any]: + """Enhanced conversion of dataclass instances to a dict.""" + res = collections.OrderedDict() + for field in dataclasses.fields(obj): + default = field.default_factory() if callable(field.default_factory) else field.default + if not (omit_private and field.name.startswith('_')): + value = getattr(obj, field.name) + if not (omit_defaults and value == default): + if hasattr(value, 'asdict'): + value = value.asdict(omit_defaults=True) + res[field.name] = value + return res + + +def valid_id_property(v: str) -> Optional[str]: + """Validator for the @id 
property.""" + if not isinstance(v, str): + warnings.warn('Inconsistent link property') + return None + if v.startswith('_'): + raise ValueError(f'Invalid @id property: {v}') + return v + + +def valid_context_property(ctx): + nsurl = NAMESPACES['csvw'].replace('#', '') + if ctx is None: + return ctx + if isinstance(ctx, str): + assert ctx == nsurl + return ctx + assert isinstance(ctx, list), ctx + for obj in ctx: + if any((isinstance(obj, dict) and not set(obj.keys()).issubset({'@base', '@language'}), + isinstance(obj, str) and obj != nsurl)): + raise ValueError( + f'The @context MUST have one of the following values: An array composed of a ' + f'string followed by an object, where the string is {nsurl} and the ' + f'object represents a local context definition, which is restricted to contain ' + f'either or both of @base and @language.') + if isinstance(obj, dict) and '@language' in obj and not tags.check(obj['@language']): + warnings.warn('Invalid value for @language property') + del obj['@language'] + return ctx + + +def valid_common_property(v): # pylint: disable=too-many-branches + """Validator for values of common properties.""" + if not isinstance(v, (dict, list)): + # No JSON container types. We'll just assume all is good. + return v + + if isinstance(v, list): # Recurse into the items. 
+ return [valid_common_property(vv) for vv in v] + + if not {k[1:] for k in v if k.startswith('@')}.issubset({'id', 'language', 'type', 'value'}): + raise ValueError( + "Aside from @value, @type, @language, and @id, the properties used on an object " + "MUST NOT start with @.") + if '@value' in v: + if any(( + len(v) > 2, + set(v.keys()) not in [{'@value', '@language'}, {'@value', '@type'}], + not isinstance(v['@value'], (str, bool, int, float, decimal.Decimal)) + )): + raise ValueError( + "If a @value property is used on an object, that object MUST NOT have any other " + "properties aside from either @type or @language, and MUST NOT have both @type and " + "@language as properties. The value of the @value property MUST be a string, " + "number, or boolean value.") + if '@language' in v and '@value' not in v: + raise ValueError( + "A @language property MUST NOT be used on an object unless it also has a @value " + "property.") + if '@id' in v: + v['@id'] = valid_id_property(v['@id']) + if '@language' in v: + if not (isinstance(v['@language'], str) and tags.check(v['@language'])): + warnings.warn('Invalid language tag') + del v['@language'] + if '@type' in v: + vv = v['@type'] + if isinstance(vv, str): + if vv.startswith('_:'): + raise ValueError( + 'The value of any @id or @type contained within a metadata document ' + 'MUST NOT be a blank node.') + if not any(( + is_url(vv), + any(vv == ns or vv.startswith(ns + ':') for ns in NAMESPACES), + vv in CSVW_TERMS + )): + raise ValueError( + 'The value of any member of @type MUST be either a term defined in ' + '[csvw-context], a prefixed name where the prefix is a term defined in ' + '[csvw-context], or an absolute URL.') + elif not isinstance(vv, (list, dict)): + raise ValueError('Invalid datatype for @type') + return {k: valid_common_property(vv) for k, vv in v.items()} + + +@dataclasses.dataclass +class DescriptionBase: + """Container for + - common properties (see 
http://w3c.github.io/csvw/metadata/#common-properties) + - @-properties. + """ + common_props: dict[str, Any] = dataclasses.field(default_factory=dict) + at_props: dict[str, Any] = dataclasses.field(default_factory=dict) + + @classmethod + def partition_properties( + cls, + d: Union[dict, Any], + type_name: Optional[str] = None, + strict: bool = True + ) -> Union[dict, None]: + """ + Partitions properties in d into `common_props`, `at_props` and the remaining. + """ + if d and not isinstance(d, dict): + return None + fields = {f.name: f for f in dataclasses.fields(cls)} + type_name = type_name or cls.__name__ + c, a, dd = {}, {}, {} + for k, v in (d or {}).items(): + if k.startswith('@'): + if k == '@id': + v = valid_id_property(v) + if k == '@type' and v != type_name: + raise ValueError(f'Invalid @type property {v} for {type_name}') + a[k[1:]] = v + elif ':' in k: + c[k] = valid_common_property(v) + else: + if strict and (k not in fields): + warnings.warn(f'Invalid property {k} for {type_name}') + else: + dd[k] = v + return dict(common_props=c, at_props=a, **dd) # pylint: disable=R1735 + + @classmethod + def fromvalue(cls, d: dict): + """Initialize instance from dict.""" + return cls(**cls.partition_properties(d)) + + def _iter_dict_items(self, omit_defaults) -> Generator[tuple[str, Any], None, None]: + def _asdict_single(v): + return v.asdict(omit_defaults=omit_defaults) if hasattr(v, 'asdict') else v + + def _asdict_multiple(v): + if isinstance(v, (list, tuple)): + return [_asdict_single(vv) for vv in v] + return _asdict_single(v) + + for k, v in sorted(self.at_props.items()): + yield '@' + k, _asdict_multiple(v) + + for k, v in sorted(self.common_props.items()): + yield k, _asdict_multiple(v) + + for k, v in dataclass_asdict(self, omit_defaults=omit_defaults).items(): + if k not in ('common_props', 'at_props'): + yield k, _asdict_multiple(v) + + def asdict(self, omit_defaults=True) -> collections.OrderedDict[str, Any]: + """Serialization as dict.""" + # Note: 
The `null` property is the only inherited, list-valued property where the default + # is not the empty list. Thus, to allow setting it to empty, we must treat `null` as + # special case here. + # See also https://www.w3.org/TR/tabular-metadata/#dfn-inherited-property + return collections.OrderedDict( + (k, v) for k, v in self._iter_dict_items(omit_defaults) + if (k == 'null' or (v not in ([], {})))) + + +def dialect_props(d: dict[str, Any]) -> dict: + """Slightly massage the a dialect specification into something accepted by our Dialect class.""" + if not isinstance(d, dict): + warnings.warn('Invalid dialect spec') + return {} + partitioned = DescriptionBase.partition_properties(d, type_name='Dialect', strict=False) + del partitioned['at_props'] + del partitioned['common_props'] + if partitioned.get('headerRowCount'): + partitioned['header'] = True + return partitioned diff --git a/src/csvw/utils.py b/src/csvw/utils.py index 109fc91..ee953f5 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -1,3 +1,4 @@ +import io import re import copy import html @@ -8,7 +9,39 @@ import warnings import collections import unicodedata -from typing import Callable, Any +from types import MethodType +from typing import Callable, Any, Union + +import requests + + +def log_or_raise(msg, log=None, level='warning', exception_cls=ValueError): + if log: + getattr(log, level)(msg) + else: + raise exception_cls(msg) + + +def nolog(level='warning'): + class Log(object): + pass + + log = Log() + setattr(log, level, MethodType(lambda *args, **kw: None, log)) + return log + + +def json_open(filename, mode='r', encoding='utf-8'): + assert encoding == 'utf-8' + return io.open(filename, mode, encoding=encoding) + + +def get_json(fname) -> Union[list, dict]: + fname = str(fname) + if is_url(fname): + return requests.get(fname).json(object_pairs_hook=collections.OrderedDict) + with json_open(fname) as f: + return json.load(f, object_pairs_hook=collections.OrderedDict) def optional(type_: type) 
-> Callable[[Any], Any]: @@ -38,21 +71,6 @@ def ensure_path(fname): return fname -def attr_asdict(obj, omit_defaults=True, omit_private=True): - import dataclasses - - res = collections.OrderedDict() - for field in dataclasses.fields(obj): - default = field.default_factory() if callable(field.default_factory) else field.default - if not (omit_private and field.name.startswith('_')): - value = getattr(obj, field.name) - if not (omit_defaults and value == default): - if hasattr(value, 'asdict'): - value = value.asdict(omit_defaults=True) - res[field.name] = value - return res - - def normalize_name(s): """Convert a string into a valid python attribute name. This function is called to convert ASCII strings to something that can pass as diff --git a/tests/conftest.py b/tests/conftest.py index 9991037..7f32cc7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,8 @@ import pytest import attr -from csvw.metadata import CSVW, get_json +from csvw.metadata import CSVW +from csvw.utils import get_json def pytest_addoption(parser): diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 292e444..bc25fd9 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -8,7 +8,7 @@ import warnings import collections -from csvw.metadata import json_open +from csvw.utils import json_open import pytest diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py new file mode 100644 index 0000000..26adbfd --- /dev/null +++ b/tests/test_metadata_utils.py @@ -0,0 +1,30 @@ +import dataclasses + +import pytest + +from csvw.metadata_utils import * + + +@pytest.fixture +def Dataclass(): + @dataclasses.dataclass + class Test: + _private: int = 5 + public: str = 'hello' + + return Test + + +@pytest.mark.parametrize( + 'data,kw,expected', + [ + (dict(), dict(), {}), + (dict(), dict(omit_defaults=False), {'public': 'hello'}), + (dict(), dict(omit_defaults=False, omit_private=False), {'public': 'hello', '_private': 5}), + (dict(), 
dict(omit_private=False), {}), + (dict(_private=1), dict(omit_private=False), {'_private': 1}), + (dict(public='world'), dict(), {'public': 'world'}), + ] +) +def test_dataclass_asdict(Dataclass, data, kw, expected): + assert dataclass_asdict(Dataclass(**data), **kw) == expected From 222c659867805b50c7c9b30cebcb12192f8bb2a5 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Mon, 2 Mar 2026 19:20:59 +0100 Subject: [PATCH 06/17] more linting --- src/csvw/datatypes.py | 308 +++++++++++++++++++++++------------------- 1 file changed, 167 insertions(+), 141 deletions(-) diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py index 3a5591c..3a3dfb1 100644 --- a/src/csvw/datatypes.py +++ b/src/csvw/datatypes.py @@ -8,6 +8,7 @@ .. seealso:: http://w3c.github.io/csvw/metadata/#datatypes """ +import functools import re import json as _json import math @@ -49,7 +50,7 @@ def to_binary(s, encoding='utf-8'): @register -class anyAtomicType: +class anyAtomicType: # pylint: disable=invalid-name """ A basic datatype consists of @@ -66,7 +67,7 @@ class anyAtomicType: @classmethod def value_error(cls, v): - raise ValueError('invalid lexical value for {}: {}'.format(cls.name, v)) + raise ValueError(f'invalid lexical value for {cls.name}: {v}') def __str__(self) -> str: return self.name @@ -85,7 +86,7 @@ def to_string(v: object, **kw) -> str: @register -class string(anyAtomicType): +class string(anyAtomicType): # pylint: disable=invalid-name """ Maps to `str`. @@ -113,7 +114,7 @@ def to_python(v, regex=None): @register -class anyURI(string): +class anyURI(string): # pylint: disable=invalid-name """ Maps to `rfc3986.URIReference`. 
@@ -148,7 +149,7 @@ def to_string(v, **kw): @register -class NMTOKEN(string): +class NMTOKEN(string): # pylint: disable=invalid-name """ Maps to `str` @@ -174,7 +175,7 @@ def to_python(v, regex=None): @register -class base64Binary(anyAtomicType): +class base64Binary(anyAtomicType): # pylint: disable=invalid-name """ Maps to `bytes` """ @@ -199,7 +200,7 @@ def to_string(v, **kw): @register -class _binary(base64Binary): +class _binary(base64Binary): # pylint: disable=invalid-name """ Maps to `bytes`. Alias for :class:`base64Binary` """ @@ -207,7 +208,7 @@ class _binary(base64Binary): @register -class hexBinary(anyAtomicType): +class hexBinary(anyAtomicType): # pylint: disable=invalid-name """ Maps to `bytes`. @@ -237,7 +238,7 @@ def to_string(v, **kw): @register -class boolean(anyAtomicType): +class boolean(anyAtomicType): # pylint: disable=invalid-name """ Maps to `bool`. @@ -290,7 +291,7 @@ def with_tz(v, func, args, kw): tz = tz.groups()[0] res = func(v, *args, **kw) if tz: - dt = dateutil.parser.parse('{}{}'.format(datetime.datetime.now(), tz)) + dt = dateutil.parser.parse(f'{datetime.datetime.now()}{tz}') res = datetime.datetime( res.year, res.month, res.day, res.hour, res.minute, res.second, res.microsecond, dt.tzinfo) @@ -298,7 +299,7 @@ def with_tz(v, func, args, kw): @register -class dateTime(anyAtomicType): +class dateTime(anyAtomicType): # pylint: disable=invalid-name """ Maps to `datetime.datetime`. """ @@ -339,7 +340,7 @@ def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): if pattern and regex: match = regex.match(v) if not match: - raise ValueError('{} -- {} -- {}'.format(pattern, v, regex)) # pragma: + raise ValueError(f'{pattern} -- {v} -- {regex}') # pragma: try: return fromisoformat(v) except ValueError: @@ -353,7 +354,7 @@ def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): @register -class _dateTime(dateTime): +class _dateTime(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime`. 
Alias for :class:`dateTime` """ @@ -361,7 +362,7 @@ class _dateTime(dateTime): @register -class date(dateTime): +class date(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime` (in order to be able to preserve timezone information). """ @@ -390,7 +391,7 @@ def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): @register -class dateTimeStamp(dateTime): +class dateTimeStamp(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime`. """ @@ -406,7 +407,7 @@ def derived_description(datatype: "csvw.Datatype") -> dict: @register -class _time(dateTime): +class _time(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime` (in order to be able to preserve timezone information). """ @@ -430,7 +431,7 @@ def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): @register -class duration(anyAtomicType): +class duration(anyAtomicType): # pylint: disable=invalid-name """ Maps to `datetime.timedelta`. @@ -462,7 +463,7 @@ def to_string(v, format=None, **kw): @register -class dayTimeDuration(duration): +class dayTimeDuration(duration): # pylint: disable=invalid-name """ Maps to `datetime.timedelta`. """ @@ -470,7 +471,7 @@ class dayTimeDuration(duration): @register -class yearMonthDuration(duration): +class yearMonthDuration(duration): # pylint: disable=invalid-name """ Maps to `datetime.timedelta`. """ @@ -478,7 +479,7 @@ class yearMonthDuration(duration): @register -class decimal(anyAtomicType): +class decimal(anyAtomicType): # pylint: disable=invalid-name """ Maps to `decimal.Decimal`. @@ -605,7 +606,7 @@ def repl(m): @register -class integer(decimal): +class integer(decimal): # pylint: disable=invalid-name """ Maps to `int`. 
""" @@ -617,7 +618,7 @@ def to_python(cls, v, **kw): res = decimal.to_python(v, **kw) numerator, denominator = res.as_integer_ratio() if denominator == 1: - if cls.range and not (cls.range[0] <= numerator <= cls.range[1]): + if cls.range and not cls.range[0] <= numerator <= cls.range[1]: raise ValueError("{} must be an integer between {} and {}, but got ".format( cls.name, cls.range[0], cls.range[1]), v) return numerator @@ -625,7 +626,7 @@ def to_python(cls, v, **kw): @register -class _int(integer): +class _int(integer): # pylint: disable=invalid-name """ Maps to `int`. Alias for :class:`integer`. """ @@ -633,7 +634,7 @@ class _int(integer): @register -class unsignedInt(integer): +class unsignedInt(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -652,7 +653,7 @@ class unsignedInt(integer): @register -class unsignedShort(integer): +class unsignedShort(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -671,7 +672,7 @@ class unsignedShort(integer): @register -class unsignedLong(integer): +class unsignedLong(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -690,7 +691,7 @@ class unsignedLong(integer): @register -class unsignedByte(integer): +class unsignedByte(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -712,7 +713,7 @@ class unsignedByte(integer): @register -class short(integer): +class short(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -731,7 +732,7 @@ class short(integer): @register -class long(integer): +class long(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -751,7 +752,7 @@ class long(integer): @register -class byte(integer): +class byte(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -771,7 +772,7 @@ class byte(integer): @register -class nonNegativeInteger(integer): +class nonNegativeInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. 
""" @@ -780,7 +781,7 @@ class nonNegativeInteger(integer): @register -class positiveInteger(integer): +class positiveInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -789,7 +790,7 @@ class positiveInteger(integer): @register -class nonPositiveInteger(integer): +class nonPositiveInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -799,7 +800,7 @@ class nonPositiveInteger(integer): @register -class negativeInteger(integer): +class negativeInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -809,7 +810,7 @@ class negativeInteger(integer): @register -class _float(anyAtomicType): +class _float(anyAtomicType): # pylint: disable=invalid-name """ Maps to `float`. @@ -848,7 +849,7 @@ def to_string(v, **kw): @register -class number(_float): +class number(_float): # pylint: disable=invalid-name """ Maps to `float`. """ @@ -856,7 +857,7 @@ class number(_float): @register -class double(_float): +class double(_float): # pylint: disable=invalid-name """ Maps to `float`. """ @@ -864,7 +865,7 @@ class double(_float): @register -class normalizedString(string): +class normalizedString(string): # pylint: disable=invalid-name """ Maps to `str`. @@ -899,7 +900,7 @@ class QName(string): @register -class gDay(string): +class gDay(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -907,7 +908,7 @@ class gDay(string): @register -class gMonth(string): +class gMonth(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -915,7 +916,7 @@ class gMonth(string): @register -class gMonthDay(string): +class gMonthDay(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -923,7 +924,7 @@ class gMonthDay(string): @register -class gYear(string): +class gYear(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -931,7 +932,7 @@ class gYear(string): @register -class gYearMonth(string): +class gYearMonth(string): # pylint: disable=invalid-name """ Maps to `str`. 
""" @@ -939,7 +940,7 @@ class gYearMonth(string): @register -class xml(string): +class xml(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -947,7 +948,7 @@ class xml(string): @register -class html(string): +class html(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -955,7 +956,7 @@ class html(string): @register -class json(string): +class json(string): # pylint: disable=invalid-name """ Maps to `str`, `list` or `dict`, i.e. to the result of `json.loads`. @@ -1010,7 +1011,7 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # FIXME: ignored **kw? # why not just to_python = staticmethod(_json.loads)? @staticmethod - def to_python(v, schema=None, **kw): + def to_python(v, schema=None, **_): # pylint: disable=W0237 res = _json.loads(v, object_pairs_hook=collections.OrderedDict) if schema: try: @@ -1024,6 +1025,13 @@ def to_string(v, **kw): return _json.dumps(v) +def _get_sep(dfmt, options): + for d_sep in options: # Determine the separator used for date components. + if d_sep in dfmt: + return d_sep + return None + + def dt_format_and_regex(fmt, no_date=False): """ @@ -1063,28 +1071,10 @@ def dt_format_and_regex(fmt, no_date=False): "M.d.yyyy", # e.g., 3.22.2015 } time_patterns = {"HH:mm:ss", "HHmmss", "HH:mm", "HHmm"} - - # We map dateTime component markers to corresponding fromat specs and regular - # expressions used for formatting and parsing. - translate = { - 'yyyy': ('{dt.year:04d}', '(?P[0-9]{4})'), - 'MM': ('{dt.month:02d}', '(?P[0-9]{2})'), - 'dd': ('{dt.day:02d}', '(?P[0-9]{2})'), - 'M': ('{dt.month}', '(?P[0-9]{1,2})'), - 'd': ('{dt.day}', '(?P[0-9]{1,2})'), - 'HH': ('{dt.hour:02d}', '(?P[0-9]{2})'), - 'mm': ('{dt.minute:02d}', '(?P[0-9]{2})'), - 'ss': ('{dt.second:02d}', '(?P[0-9]{2})'), - } - - for dt_sep in ' T': # Only a single space or "T" may separate date and time format. 
- # Since space or "T" isn't allowed anywhere else in the format, checking whether - # we are dealing with a date or dateTime format is simple: - if dt_sep in fmt: - break - else: - dt_sep = None - + # Only a single space or "T" may separate date and time format. + # Since space or "T" isn't allowed anywhere else in the format, checking whether + # we are dealing with a date or dateTime format is simple: + dt_sep = _get_sep(fmt, ' T') if dt_sep: dfmt, tfmt = fmt.split(dt_sep) elif no_date: @@ -1103,50 +1093,51 @@ def dt_format_and_regex(fmt, no_date=False): if (dfmt and dfmt not in date_patterns) or (tfmt and tfmt not in time_patterns): raise ValueError(fmt) - regex, format = '', '' # Initialize the output. + regex, format = _get_regex_and_format(dfmt, tfmt, dt_sep, msecs) # pylint: disable=W0622 + return {'regex': re.compile(regex), 'fmt': format, 'tz_marker': tz_marker, 'pattern': pattern} - if dfmt: - for d_sep in '.-/': # Determine the separator used for date components. - if d_sep in dfmt: - break - else: - d_sep = None - if d_sep: - # Iterate over date components, converting them to string format specs and regular - # expressions. - for i, part in enumerate(dfmt.split(d_sep)): +def _get_regex_and_format(dfmt, tfmt, dt_sep, msecs): + def _add_chars(fmt, ff, rr, sep=None): + if sep: + for i, part in enumerate(fmt.split(sep)): if i > 0: - format += d_sep - regex += re.escape(d_sep) + ff += sep + rr += re.escape(sep) f, r = translate[part] - format += f - regex += r + ff += f + rr += r else: - for _, chars in itertools.groupby(dfmt, lambda k: k): + for _, chars in itertools.groupby(fmt, lambda k: k): f, r = translate[''.join(chars)] - format += f - regex += r + ff += f + rr += r + return ff, rr + + # We map dateTime component markers to corresponding fromat specs and regular + # expressions used for formatting and parsing. 
+ translate = { + 'yyyy': ('{dt.year:04d}', '(?P[0-9]{4})'), + 'MM': ('{dt.month:02d}', '(?P[0-9]{2})'), + 'dd': ('{dt.day:02d}', '(?P[0-9]{2})'), + 'M': ('{dt.month}', '(?P[0-9]{1,2})'), + 'd': ('{dt.day}', '(?P[0-9]{1,2})'), + 'HH': ('{dt.hour:02d}', '(?P[0-9]{2})'), + 'mm': ('{dt.minute:02d}', '(?P[0-9]{2})'), + 'ss': ('{dt.second:02d}', '(?P[0-9]{2})'), + } + + regex, format = '', '' # Initialize the output. pylint: disable=redefined-builtin + + if dfmt: + format, regex = _add_chars(dfmt, format, regex, _get_sep(dfmt, '.-/')) if dt_sep: format += dt_sep regex += re.escape(dt_sep) if tfmt: - # For time components the only valid separator is ":". - if ':' in tfmt: - for i, part in enumerate(tfmt.split(':')): - if i > 0: - format += ':' - regex += re.escape(':') - f, r = translate[part] - format += f - regex += r - else: - for _, chars in itertools.groupby(tfmt, lambda k: k): - f, r = translate[''.join(chars)] - format += f - regex += r + format, regex = _add_chars(tfmt, format, regex, ':' if ':' in tfmt else None) # Fractions of seconds are a bit of a problem, because datetime objects only offer # microseconds. @@ -1154,8 +1145,7 @@ def dt_format_and_regex(fmt, no_date=False): format += '.{microsecond:.%s}' % msecs regex += r'(\.(?P[0-9]{1,%s})(?![0-9]))?' % msecs regex += r'(\.(?P[0-9]{%s,})(?![0-9]))?' % (msecs + 1,) - - return {'regex': re.compile(regex), 'fmt': format, 'tz_marker': tz_marker, 'pattern': pattern} + return regex, format class NumberPattern: @@ -1167,36 +1157,51 @@ class NumberPattern: The number of # placeholder characters before the decimal do not matter, since no limit is placed on the maximum number of digits. There should, however, be at least one zero someplace in the pattern. - """ + Example: #,##0.## + + .. 
seealso:: ``_ + """ def __init__(self, pattern): assert pattern.count(';') <= 1 self.positive, _, self.negative = pattern.partition(';') if not self.negative: self.negative = '-' + self.positive.replace('+', '') - @property - def primary_grouping_size(self): + @functools.cached_property + def primary_grouping_size(self) -> int: + """ + Number of digits in the primary grouping, i.e. the size of the chunk between the + secondary grouping character and the decimal point. + """ comps = self.positive.split('.')[0].split(',') if len(comps) > 1: return comps[-1].count('#') + comps[-1].count('0') - - @property - def secondary_grouping_size(self): + return 0 + + @functools.cached_property + def secondary_grouping_size(self) -> int: + """ + Number of digits in the secondary grouping, i.e. the size of the chunk between two + secondary grouping characters. + """ comps = self.positive.split('.')[0].split(',') if len(comps) > 2: return comps[1].count('#') + comps[1].count('0') return self.primary_grouping_size - @property - def min_digits_before_decimal_point(self): + @functools.cached_property + def min_digits_before_decimal_point(self) -> int: + """Number of 0s before the decimal point in the pattern.""" integral_part = self.positive.split('.')[0] match = re.search('([0]+)$', integral_part) if match: return len(match.groups()[0]) + return 0 - @property - def exponent_digits(self): + @functools.cached_property + def exponent_digits(self) -> int: + """Number of digits in the exponent in the pattern.""" _, _, exponent = self.positive.lower().partition('e') i = 0 for c in exponent: @@ -1208,8 +1213,9 @@ def exponent_digits(self): break return i - @property - def decimal_digits(self): + @functools.cached_property + def decimal_digits(self) -> int: + """Number of decimal digits in the pattern.""" i = 0 _, _, decimal_part = self.positive.partition('.') for c in decimal_part: @@ -1219,8 +1225,9 @@ def decimal_digits(self): break return i - @property - def 
significant_decimal_digits(self): + @functools.cached_property + def significant_decimal_digits(self) -> int: + """Number of *significant* decimal digits in the pattern, i.e. 0 counts, # does not.""" i = 0 _, _, decimal_part = self.positive.partition('.') for c in decimal_part: @@ -1230,14 +1237,10 @@ def significant_decimal_digits(self): break return i - def is_valid(self, s): - def digits(ss): - return [c for c in ss if c not in '.,E+-%‰'] - - integral_part, _, decimal_part = s.partition('.') - decimal_part, _, exponent = decimal_part.lower().partition('e') - groups = integral_part.split(',') + @staticmethod + def _get_significant(groups): significant, leadingzero, skip = [], False, True + for c in ''.join(groups): if c in ['+', '-', '%', # fixme: permil ]: @@ -1250,14 +1253,47 @@ def digits(ss): significant.append(c) if not significant and leadingzero: significant = ['0'] - if self.min_digits_before_decimal_point and \ - len(significant) < self.min_digits_before_decimal_point: + return significant + + def is_valid(self, s: str) -> bool: + """Validates a string representing a number against the pattern.""" + def digits(ss): + return [c for c in ss if c not in '.,E+-%‰'] + + integral_part, _, decimal_part = s.partition('.') + decimal_part, _, _ = decimal_part.lower().partition('e') + groups = integral_part.split(',') + significant = self._get_significant(groups) + + if any(( + all(( + self.min_digits_before_decimal_point, + len(significant) < self.min_digits_before_decimal_point)), + all(( + self.primary_grouping_size, + groups, + len(digits(groups[-1])) > self.primary_grouping_size)), + all(( + self.primary_grouping_size, + groups, + len(groups) > 1, + len(digits(groups[-1])) < self.primary_grouping_size)), + all(( + decimal_part, + len(digits(decimal_part)) > self.decimal_digits, + )), + all(( + self.significant_decimal_digits, + (not decimal_part) or (len(digits(decimal_part)) < self.significant_decimal_digits), + )), + all(( + self.exponent_digits, + 'e' in 
s.lower(), + len(digits(s.lower().split('e')[-1])) > self.exponent_digits + )), + )): return False - if self.primary_grouping_size and groups: - if len(digits(groups[-1])) > self.primary_grouping_size: - return False - if len(groups) > 1 and len(digits(groups[-1])) < self.primary_grouping_size: - return False + if self.secondary_grouping_size and len(groups) > 1: for i, group in enumerate(groups[:-1]): if i == 0: @@ -1266,15 +1302,5 @@ def digits(ss): else: if len(digits(group)) != self.secondary_grouping_size: return False - if decimal_part: - if len(digits(decimal_part)) > self.decimal_digits: - return False - if self.significant_decimal_digits: - if (not decimal_part) or (len(digits(decimal_part)) < self.significant_decimal_digits): - return False - - if self.exponent_digits and 'e' in s.lower(): - if len(digits(s.lower().split('e')[-1])) > self.exponent_digits: - return False return True From 47aaef910b71cb1c5f6979bed6f71f0fc56b902b Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Mon, 2 Mar 2026 20:02:44 +0100 Subject: [PATCH 07/17] bit more work --- src/csvw/datatypes.py | 94 +++++++++++++++++++++---------------------- src/csvw/dsv.py | 72 ++++++++++++++++----------------- src/csvw/metadata.py | 4 +- src/csvw/utils.py | 29 ++++++------- 4 files changed, 99 insertions(+), 100 deletions(-) diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py index 3a3dfb1..142d0d5 100644 --- a/src/csvw/datatypes.py +++ b/src/csvw/datatypes.py @@ -13,7 +13,7 @@ import json as _json import math import base64 -import typing +from typing import Optional, TYPE_CHECKING, Any import decimal as _decimal import binascii import datetime @@ -30,7 +30,7 @@ from ._compat import fromisoformat -if typing.TYPE_CHECKING: # pragma: no cover +if TYPE_CHECKING: # pragma: no cover import csvw __all__ = ['DATATYPES'] @@ -66,22 +66,22 @@ class anyAtomicType: # pylint: disable=invalid-name example = 'x' @classmethod - def value_error(cls, v): + def value_error(cls, v): # pylint: disable=C0116 
raise ValueError(f'invalid lexical value for {cls.name}: {v}') def __str__(self) -> str: return self.name @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return {} @staticmethod - def to_python(v: str, **kw) -> object: + def to_python(v: str, **kw) -> Any: # pylint: disable=C0116 return v # pragma: no cover @staticmethod - def to_string(v: object, **kw) -> str: + def to_string(v: object, **kw) -> str: # pylint: disable=C0116 return '{}'.format(v) @@ -96,7 +96,7 @@ class string(anyAtomicType): # pylint: disable=invalid-name name = 'string' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 if datatype.format: # We wrap a regex specified as `format` property into a group and add `$` to # make sure the whole string is matched when validating. @@ -107,7 +107,7 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return {} @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None): # pylint: disable=C0116 if regex and not regex.match(v): string.value_error(v) return v @@ -132,12 +132,12 @@ class anyURI(string): # pylint: disable=invalid-name name = 'anyURI' @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None): # pylint: disable=C0116 res = string.to_python(v, regex=regex) return rfc3986.URIReference.from_string(res.encode('utf-8')) @staticmethod - def to_string(v, **kw): + def to_string(v, **kw): # pylint: disable=C0116 if hasattr(v, 'geturl'): # Presumably a `urllib.parse.ParseResult`. 
return v.geturl() @@ -167,7 +167,7 @@ class NMTOKEN(string): # pylint: disable=invalid-name name = "NMTOKEN" @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None): # pylint: disable=C0116 v = string.to_python(v, regex=regex) if not re.fullmatch(r'[\w.:-]*', v): NMTOKEN.value_error(v) @@ -183,7 +183,7 @@ class base64Binary(anyAtomicType): # pylint: disable=invalid-name example = 'YWJj' @staticmethod - def to_python(v, **kw): + def to_python(v, **kw): # pylint: disable=C0116 try: res = to_binary(v, encoding='ascii') except UnicodeEncodeError: @@ -195,7 +195,7 @@ def to_python(v, **kw): return res @staticmethod - def to_string(v, **kw): + def to_string(v, **kw): # pylint: disable=C0116 return base64.encodebytes(v).decode().strip() @@ -221,7 +221,7 @@ class hexBinary(anyAtomicType): # pylint: disable=invalid-name example = 'ab' @staticmethod - def to_python(v, **kw): + def to_python(v, **kw): # pylint: disable=C0116 try: res = to_binary(v, encoding='ascii') except UnicodeEncodeError: @@ -233,7 +233,7 @@ def to_python(v, **kw): return res @staticmethod - def to_string(v, **kw): + def to_string(v, **kw): # pylint: disable=C0116 return binascii.hexlify(v).decode().upper() @@ -258,7 +258,7 @@ class boolean(anyAtomicType): # pylint: disable=invalid-name example = 'false' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 if datatype.format and isinstance(datatype.format, str) and datatype.format.count('|') == 1: true, false = [[v] for v in datatype.format.split('|')] else: @@ -269,7 +269,7 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return {'true': true, 'false': false} @staticmethod - def to_python(s, true=('true', '1'), false=('false', '0')): + def to_python(s, true=('true', '1'), false=('false', '0')): # pylint: disable=C0116 if isinstance(s, bool) or s is None: return s if s in true: @@ -279,7 +279,7 @@ def to_python(s, 
true=('true', '1'), false=('false', '0')): raise boolean.value_error(s) @staticmethod - def to_string(v, true=('true', '1'), false=('false', '0')): + def to_string(v, true=('true', '1'), false=('false', '0')): # pylint: disable=C0116 return (true if v else false)[0] @@ -308,7 +308,7 @@ class dateTime(anyAtomicType): # pylint: disable=invalid-name example = '2018-12-10T20:20:20' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return dt_format_and_regex(datatype.format) @staticmethod @@ -336,7 +336,7 @@ def _parse(v, cls, regex, tz_marker=None): return res @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 if pattern and regex: match = regex.match(v) if not match: @@ -347,7 +347,7 @@ def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): return dateTime._parse(v, datetime.datetime, regex, tz_marker=tz_marker) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 if pattern: return babel.dates.format_datetime(v, tzinfo=v.tzinfo, format=pattern) return v.isoformat() @@ -370,7 +370,7 @@ class date(dateTime): # pylint: disable=invalid-name example = '2018-12-10' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 try: return dt_format_and_regex(datatype.format or 'yyyy-MM-dd') except ValueError: @@ -378,12 +378,12 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return dt_format_and_regex('yyyy-MM-dd') @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: 
disable=C0116 return with_tz( v.strip(), dateTime.to_python, [], dict(regex=regex, fmt=fmt, pattern=pattern)) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 from babel.dates import format_date if pattern: return format_date(v, format=pattern, locale='en') @@ -399,7 +399,7 @@ class dateTimeStamp(dateTime): # pylint: disable=invalid-name example = '2018-12-10T20:20:20' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 res = dt_format_and_regex(datatype.format or 'yyyy-MM-ddTHH:mm:ss.SSSSSSXXX') if not res['tz_marker']: raise ValueError('dateTimeStamp must have timezone marker') @@ -415,18 +415,18 @@ class _time(dateTime): # pylint: disable=invalid-name example = '20:20:20' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return dt_format_and_regex(datatype.format or 'HH:mm:ss', no_date=True) @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 if pattern and 'x' in pattern.lower(): return dateutil.parser.parse('{}T{}'.format(datetime.date.today().isoformat(), v)) assert regex is not None return with_tz(v, dateTime._parse, [datetime.datetime, regex], dict(tz_marker=tz_marker)) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 return babel.dates.format_time(v, tzinfo=v.tzinfo, format=pattern) @@ -448,17 +448,17 @@ class duration(anyAtomicType): # pylint: disable=invalid-name example = 'P3Y6M4DT12H30M5S' @staticmethod - def derived_description(datatype: 
"csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return {'format': datatype.format} @staticmethod - def to_python(v, format=None, **kw): + def to_python(v, format=None, **kw): # pylint: disable=C0116 if format and not re.match(format, v): raise ValueError return isodate.parse_duration(v) @staticmethod - def to_string(v, format=None, **kw): + def to_string(v, format=None, **kw): # pylint: disable=C0116 return isodate.duration_isoformat(v) @@ -523,14 +523,14 @@ class decimal(anyAtomicType): # pylint: disable=invalid-name _reverse_special = {v: k for k, v in _special.items()} @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 if datatype.format: return datatype.format if isinstance(datatype.format, dict) \ else {'pattern': datatype.format} return {} @staticmethod - def to_python(v, pattern=None, decimalChar=None, groupChar=None): + def to_python(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116 if isinstance(v, str) and 'e' in v.lower(): raise ValueError('Invalid value for decimal') @@ -567,9 +567,9 @@ def to_python(v, pattern=None, decimalChar=None, groupChar=None): decimal.value_error(v) @staticmethod - def to_string(v, pattern=None, decimalChar=None, groupChar=None): - if '{}'.format(v) in decimal._reverse_special: - return decimal._reverse_special['{}'.format(v)] + def to_string(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116 + if f'{v}' in decimal._reverse_special: + return decimal._reverse_special[f'{v}'] if pattern: v = babel.numbers.format_decimal(v, pattern, 'en') @@ -591,8 +591,8 @@ def to_string(v, pattern=None, decimalChar=None, groupChar=None): exp = int(exp) zero_padding = '0' * (abs(int(exp)) - 1) sign = '-' if neg else '' - return '{}{}{}.0'.format(sign, digits, zero_padding) if exp > 0 else ( - '{}0.{}{}'.format(sign, 
zero_padding, digits)) + return f'{sign}{digits}{zero_padding}.0' if exp > 0 \ + else f'{sign}0.{zero_padding}{digits}' if groupChar or decimalChar: def repl(m): @@ -611,16 +611,17 @@ class integer(decimal): # pylint: disable=invalid-name Maps to `int`. """ name = 'integer' - range = None + range: Optional[tuple[int, int]] = None @classmethod - def to_python(cls, v, **kw): + def to_python(cls, v, **kw): # pylint: disable=C0116 res = decimal.to_python(v, **kw) numerator, denominator = res.as_integer_ratio() if denominator == 1: if cls.range and not cls.range[0] <= numerator <= cls.range[1]: - raise ValueError("{} must be an integer between {} and {}, but got ".format( - cls.name, cls.range[0], cls.range[1]), v) + raise ValueError( + f"{cls.name} must be an integer between {cls.range[0]} and {cls.range[1]}, " + f"but got ", v) return numerator raise ValueError('Invalid value for integer') @@ -833,10 +834,9 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return {} @staticmethod - def to_python(v, pattern=None, **kw): + def to_python(v, pattern=None, **kw): # pylint: disable=R1710 if pattern and not NumberPattern(pattern).is_valid(v): - raise ValueError( - 'Invalid value "{}" for number with pattern "{}"'.format(v, pattern)) + raise ValueError(f'Invalid value "{v}" for number with pattern "{pattern}"') try: return float(v) @@ -844,8 +844,8 @@ def to_python(v, pattern=None, **kw): _float.value_error(v) @staticmethod - def to_string(v, **kw): - return '{}'.format(v) + def to_string(v, **_): # pylint: disable=C0116 + return f'{v}' @register diff --git a/src/csvw/dsv.py b/src/csvw/dsv.py index 0d050d2..fd1cf64 100644 --- a/src/csvw/dsv.py +++ b/src/csvw/dsv.py @@ -15,12 +15,13 @@ import csv import codecs import shutil -import typing +from typing import Optional, Union, IO, Callable import pathlib import tempfile import warnings import functools import collections +from collections.abc import Iterable, Generator from . 
import utils from .dsv_dialects import Dialect @@ -32,7 +33,8 @@ 'rewrite', 'add_rows', 'filter_rows_as_dict', ] -LINES_OR_PATH = typing.Union[str, pathlib.Path, typing.IO, typing.Iterable[str]] +PathType = Union[str, pathlib.Path] +LinesOrPath = Union[PathType, IO, Iterable[str]] def normalize_encoding(encoding: str) -> str: @@ -60,8 +62,8 @@ class UnicodeWriter: def __init__( self, - f: typing.Optional[typing.Union[str, pathlib.Path]] = None, - dialect: typing.Optional[typing.Union[Dialect, str]] = None, + f: Optional[PathType] = None, + dialect: Optional[Union[Dialect, str]] = None, **kw): self.f = f self.encoding = kw.pop('encoding', 'utf-8') @@ -103,7 +105,7 @@ def __enter__(self): self.writer = csv.writer(self.f, **self.kw) return self - def read(self) -> typing.Optional[bytes]: + def read(self) -> Optional[bytes]: """ If the writer has been initialized passing `None` as target, the CSV data as `bytes` can be retrieved calling this method. @@ -112,16 +114,17 @@ def read(self) -> typing.Optional[bytes]: self.f.seek(0) if hasattr(self.f, 'read'): return self.f.read().encode('utf-8') + return None # pragma: no cover def __exit__(self, type, value, traceback): if self._close: self.f.close() - def writerow(self, row: typing.Iterable[typing.Union[str, None]]): + def writerow(self, row: Iterable[Union[str, None]]): self.writer.writerow(self._escapedoubled(row)) self._rows_written += 1 - def writerows(self, rows: typing.Iterable[typing.Union[tuple, list, dict]]): + def writerows(self, rows: Iterable[Union[tuple, list, dict]]): """ Writes each row in `rows` formatted as CSV row. 
This behaves as [`csvwriter.writerows`](https://docs.python.org/3/library/csv.html#csv.csvwriter.writerows) @@ -164,8 +167,8 @@ class UnicodeReader: """ def __init__( self, - f: LINES_OR_PATH, - dialect: typing.Optional[typing.Union[Dialect, str]] = None, + f: LinesOrPath, + dialect: Optional[Union[Dialect, str]] = None, **kw): self.f = f self.encoding = normalize_encoding(kw.pop('encoding', 'utf-8-sig')) @@ -292,7 +295,7 @@ def __init__(self, f, fieldnames=None, restkey=None, restval=None, **kw): def fieldnames(self): if self._fieldnames is None: try: - self._fieldnames = super(UnicodeDictReader, self).__next__() + self._fieldnames = super().__next__() except StopIteration: pass self.line_num = self.reader.line_num @@ -305,17 +308,17 @@ def __next__(self) -> collections.OrderedDict: if self.line_num == 0: # Used only for its side effect. self.fieldnames - row = super(UnicodeDictReader, self).__next__() + row = super().__next__() self.line_num = self.reader.line_num # unlike the basic reader, we prefer not to return blanks, # because we will typically wind up with a dict full of None # values while row == []: - row = super(UnicodeDictReader, self).__next__() + row = super().__next__() return self.item(row) - def item(self, row) -> collections.OrderedDict: + def item(self, row) -> collections.OrderedDict[str, str]: d = collections.OrderedDict((k, v) for k, v in zip(self.fieldnames, row)) lf = len(self.fieldnames) lr = len(row) @@ -352,11 +355,11 @@ def item(self, row): **{self._normalize_fieldname(k): v for k, v in d.items() if k in self.fieldnames}) -def iterrows(lines_or_file: LINES_OR_PATH, - namedtuples: typing.Optional[bool] = False, - dicts: typing.Optional[bool] = False, - encoding: typing.Optional[str] = 'utf-8', - **kw) -> typing.Generator: +def iterrows(lines_or_file: LinesOrPath, + namedtuples: Optional[bool] = False, + dicts: Optional[bool] = False, + encoding: Optional[str] = 'utf-8', + **kw) -> Generator: """Convenience factory function for csv reader. 
:param lines_or_file: Content to be read. Either a file handle, a file path or a list\ @@ -377,16 +380,13 @@ def iterrows(lines_or_file: LINES_OR_PATH, _reader = UnicodeReader with _reader(lines_or_file, encoding=encoding, **kw) as r: - for item in r: - yield item + yield from r reader = iterrows -def rewrite(fname: typing.Union[str, pathlib.Path], - visitor: typing.Callable[[int, typing.List[str]], typing.Union[None, typing.List[str]]], - **kw): +def rewrite(fname: PathType, visitor: Callable[[int, list[str]], Union[None, list[str]]], **kw): """Utility function to rewrite rows in dsv files. :param fname: Path of the dsv file to operate on. @@ -405,10 +405,10 @@ def rewrite(fname: typing.Union[str, pathlib.Path], row = visitor(i, row) if row is not None: writer.writerow(row) - shutil.move(str(tmp), str(fname)) # Path.replace is Python 3.3+ + shutil.move(tmp, fname) -def add_rows(fname: typing.Union[str, pathlib.Path], *rows: typing.List[str]): +def add_rows(fname: PathType, *rows: list[str]): with tempfile.NamedTemporaryFile(delete=False) as fp: tmp = pathlib.Path(fp.name) @@ -419,12 +419,10 @@ def add_rows(fname: typing.Union[str, pathlib.Path], *rows: typing.List[str]): for row in reader_: writer.writerow(row) writer.writerows(rows) - shutil.move(str(tmp), str(fname)) # Path.replace is Python 3.3+ + shutil.move(tmp, fname) -def filter_rows_as_dict(fname: typing.Union[str, pathlib.Path], - filter_: typing.Callable[[dict], bool], - **kw) -> int: +def filter_rows_as_dict(fname: PathType, filter_: Callable[[dict], bool], **kw) -> int: """Rewrite a dsv file, filtering the rows. 
:param fname: Path to dsv file @@ -439,14 +437,14 @@ def filter_rows_as_dict(fname: typing.Union[str, pathlib.Path], return filter_.removed -class DictFilter(object): - - def __init__(self, filter_): - self.header = None +class DictFilter: # pylint: disable=R0903 + """Utility to apply a filter to a row as dict, while iterating over rows a list.""" + def __init__(self, filter_: Callable[[dict[str, str]], bool]): + self.header: Optional[list[str]] = None self.filter = filter_ - self.removed = 0 + self.removed: int = 0 - def __call__(self, i, row): + def __call__(self, i: int, row: list[str]) -> Optional[list[str]]: if i == 0: self.header = row return row @@ -454,5 +452,5 @@ def __call__(self, i, row): item = dict(zip(self.header, row)) if self.filter(item): return row - else: - self.removed += 1 + self.removed += 1 + return None diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 3337716..9e296a1 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -100,7 +100,7 @@ def asdict(self, **_): def convert_uri_template(v): if v is None: - return None + return None # pragma: no cover if not isinstance(v, str): warnings.warn('Invalid value for Url property') return INVALID @@ -121,7 +121,7 @@ def __init__(self, string: Union[str, pathlib.Path]): @classmethod def from_value(cls, v: Union['Link', str, pathlib.Path]): if isinstance(v, Link): - return v + return v # pragma: no cover return cls(v) def __str__(self): diff --git a/src/csvw/utils.py b/src/csvw/utils.py index ee953f5..83d97fb 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -1,4 +1,5 @@ import io +import logging import re import copy import html @@ -10,12 +11,16 @@ import collections import unicodedata from types import MethodType -from typing import Callable, Any, Union +from typing import Callable, Any, Union, Optional import requests -def log_or_raise(msg, log=None, level='warning', exception_cls=ValueError): +def log_or_raise( + msg: str, + log: Optional[logging.Logger] = None, + 
level: str = 'warning', + exception_cls: type = ValueError): if log: getattr(log, level)(msg) else: @@ -23,7 +28,7 @@ def log_or_raise(msg, log=None, level='warning', exception_cls=ValueError): def nolog(level='warning'): - class Log(object): + class Log: pass log = Log() @@ -144,8 +149,8 @@ def qname2link(qname, html=False): url = qname2url(qname) if url: if html: - return '{}'.format(url, qname) - return '[{}]({})'.format(qname, url) + return f'{qname}' + return f'[{qname}]({url})' return qname def htmlify(obj, key=None): @@ -209,15 +214,11 @@ def colrow(col, fks, pk): fks[col.name][1], fks[col.name][0], slug(fks[col.name][1])) return ' | '.join([ - '[{}]({})'.format(col.name, col.propertyUrl) - if col.propertyUrl else '`{}`'.format(col.name), - dt, - desc, - ]) + f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) res = ['# {}\n'.format(tg.common_props.get('dc:title', 'Dataset'))] if tg._fname and link_files: - res.append('> [!NOTE]\n> Described by [{0}]({0}).\n'.format(tg._fname.name)) + res.append(f'> [!NOTE]\n> Described by [{tg._fname.name}]({tg._fname.name}).\n') res.append(properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'})) @@ -225,16 +226,16 @@ def colrow(col, fks, pk): fks = { fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} - header = '## Table '.format(slug(table.url.string)) + header = f'## Table ' if link_files and tg._fname and tg._fname.parent.joinpath(table.url.string).exists(): - header += '[{0}]({0})\n'.format(table.url.string) + header += f'[{table.url.string}]({table.url.string})\n' else: # pragma: no cover header += table.url.string res.append('\n' + header + '\n') res.append(properties(table.common_props)) dialect = table.inherit('dialect') if dialect.asdict(): - res.append('\n**CSV dialect**: `{}`\n'.format(json.dumps(dialect.asdict()))) + res.append(f'\n**CSV dialect**: 
`{json.dumps(dialect.asdict())}`\n') res.append('\n### Columns\n') res.append('Name/Property | Datatype | Description') res.append(' --- | --- | --- ') From 3b5ca0af1dd29de8165acd9384a953045a72295b Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Thu, 5 Mar 2026 09:52:31 +0100 Subject: [PATCH 08/17] more work --- src/csvw/datatypes.py | 79 +++++++++---------- src/csvw/db.py | 171 +++++++++++++++++++++++------------------- src/csvw/jsonld.py | 90 ++++++++++++++-------- src/csvw/utils.py | 21 +++--- 4 files changed, 201 insertions(+), 160 deletions(-) diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py index 142d0d5..e1daf4e 100644 --- a/src/csvw/datatypes.py +++ b/src/csvw/datatypes.py @@ -1,3 +1,4 @@ +# pylint: disable=C0302 """ We model the hierarchy of basic datatypes using a class hierarchy. @@ -25,6 +26,7 @@ import rfc3986 import babel.numbers import babel.dates +from babel.dates import format_date import jsonschema import dateutil.parser @@ -38,12 +40,12 @@ DATATYPES = {} -def register(cls): +def register(cls): # pylint: disable=C0116 DATATYPES[cls.name] = cls return cls -def to_binary(s, encoding='utf-8'): +def to_binary(s, encoding='utf-8'): # pylint: disable=C0116 if not isinstance(s, bytes): return bytes(s, encoding=encoding) return s # pragma: no cover @@ -73,16 +75,16 @@ def __str__(self) -> str: return self.name @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116,W0613 return {} @staticmethod - def to_python(v: str, **kw) -> Any: # pylint: disable=C0116 + def to_python(v: str, **_) -> Any: # pylint: disable=C0116 return v # pragma: no cover @staticmethod - def to_string(v: object, **kw) -> str: # pylint: disable=C0116 - return '{}'.format(v) + def to_string(v: object, **_) -> str: # pylint: disable=C0116 + return f'{v}' @register @@ -137,7 +139,7 @@ def to_python(v, regex=None): # pylint: disable=C0116 return 
rfc3986.URIReference.from_string(res.encode('utf-8')) @staticmethod - def to_string(v, **kw): # pylint: disable=C0116 + def to_string(v, **_): # pylint: disable=C0116 if hasattr(v, 'geturl'): # Presumably a `urllib.parse.ParseResult`. return v.geturl() @@ -183,19 +185,19 @@ class base64Binary(anyAtomicType): # pylint: disable=invalid-name example = 'YWJj' @staticmethod - def to_python(v, **kw): # pylint: disable=C0116 + def to_python(v, **_): # pylint: disable=C0116 try: res = to_binary(v, encoding='ascii') except UnicodeEncodeError: base64Binary.value_error(v[:10]) try: res = base64.decodebytes(res) - except Exception: - raise ValueError('invalid base64 encoding') + except Exception as e: + raise ValueError('invalid base64 encoding') from e return res @staticmethod - def to_string(v, **kw): # pylint: disable=C0116 + def to_string(v, **_): # pylint: disable=C0116 return base64.encodebytes(v).decode().strip() @@ -221,19 +223,19 @@ class hexBinary(anyAtomicType): # pylint: disable=invalid-name example = 'ab' @staticmethod - def to_python(v, **kw): # pylint: disable=C0116 + def to_python(v, **_): # pylint: disable=C0116 try: res = to_binary(v, encoding='ascii') except UnicodeEncodeError: hexBinary.value_error(v[:10]) try: res = binascii.unhexlify(res) - except (binascii.Error, TypeError): - raise ValueError('invalid hexBinary encoding') + except (binascii.Error, TypeError) as e: + raise ValueError('invalid hexBinary encoding') from e return res @staticmethod - def to_string(v, **kw): # pylint: disable=C0116 + def to_string(v, **_): # pylint: disable=C0116 return binascii.hexlify(v).decode().upper() @@ -347,7 +349,7 @@ def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: return dateTime._parse(v, datetime.datetime, regex, tz_marker=tz_marker) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 + def to_string(v, regex=None, pattern=None, **_): # pylint: disable=C0116 if pattern: return 
babel.dates.format_datetime(v, tzinfo=v.tzinfo, format=pattern) return v.isoformat() @@ -380,11 +382,10 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C @staticmethod def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 return with_tz( - v.strip(), dateTime.to_python, [], dict(regex=regex, fmt=fmt, pattern=pattern)) + v.strip(), dateTime.to_python, [], {'regex': regex, 'fmt': fmt, 'pattern': pattern}) @staticmethod def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 - from babel.dates import format_date if pattern: return format_date(v, format=pattern, locale='en') return dateTime.to_string(v, regex=regex, fmt=fmt, tz_marker=tz_marker, pattern=pattern) @@ -421,12 +422,12 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C @staticmethod def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 if pattern and 'x' in pattern.lower(): - return dateutil.parser.parse('{}T{}'.format(datetime.date.today().isoformat(), v)) + return dateutil.parser.parse(f'{datetime.date.today().isoformat()}T{v}') assert regex is not None - return with_tz(v, dateTime._parse, [datetime.datetime, regex], dict(tz_marker=tz_marker)) + return with_tz(v, dateTime._parse, [datetime.datetime, regex], {'tz_marker': tz_marker}) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 + def to_string(v, regex=None, pattern=None, **_): # pylint: disable=C0116 return babel.dates.format_time(v, tzinfo=v.tzinfo, format=pattern) @@ -452,13 +453,13 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C return {'format': datatype.format} @staticmethod - def to_python(v, format=None, **kw): # pylint: disable=C0116 + def to_python(v, format=None, **_): # pylint: disable=C0116,W0622 if format and not re.match(format, v): raise ValueError return 
isodate.parse_duration(v) @staticmethod - def to_string(v, format=None, **kw): # pylint: disable=C0116 + def to_string(v, format=None, **_): # pylint: disable=C0116,W0613,W0622 return isodate.duration_isoformat(v) @@ -530,11 +531,12 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C return {} @staticmethod - def to_python(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116 - if isinstance(v, str) and 'e' in v.lower(): - raise ValueError('Invalid value for decimal') - - if isinstance(v, str) and re.search('{0}{0}+'.format(re.escape(groupChar or ',')), v): + def to_python(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116,W0221 + if any(( + isinstance(v, str) and 'e' in v.lower(), + isinstance(v, str) and # noqa: W504 + re.search(f"{re.escape(groupChar or ',')}{re.escape(groupChar or ',')}+", v), + )): raise ValueError('Invalid value for decimal') if groupChar is None and pattern and ',' in pattern: @@ -543,8 +545,7 @@ def to_python(v, pattern=None, decimalChar=None, groupChar=None): # pylint: dis decimalChar = '.' 
if pattern and not NumberPattern(pattern).is_valid( v.replace(groupChar or ',', ',').replace(decimalChar or '.', '.')): - raise ValueError( - 'Invalid value "{}" for decimal with pattern "{}"'.format(v, pattern)) + raise ValueError(f'Invalid value "{v}" for decimal with pattern "{pattern}"') factor = 1 if isinstance(v, str): @@ -565,9 +566,10 @@ def to_python(v, pattern=None, decimalChar=None, groupChar=None): # pylint: dis return _decimal.Decimal(v) * factor except (TypeError, _decimal.InvalidOperation): decimal.value_error(v) + return None # pragma: no cover @staticmethod - def to_string(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116 + def to_string(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116,W0221 if f'{v}' in decimal._reverse_special: return decimal._reverse_special[f'{v}'] @@ -600,7 +602,8 @@ def repl(m): return groupChar if m.group('c') == '.': return decimalChar - r = '(?P[{}])'.format(re.escape((decimalChar or '') + (groupChar or ''))) + return None + r = f"(?P[{re.escape((decimalChar or '') + (groupChar or ''))}])" v = re.sub(r, repl, v) return v @@ -614,14 +617,14 @@ class integer(decimal): # pylint: disable=invalid-name range: Optional[tuple[int, int]] = None @classmethod - def to_python(cls, v, **kw): # pylint: disable=C0116 + def to_python(cls, v, **kw): # pylint: disable=C0116,W0221 res = decimal.to_python(v, **kw) numerator, denominator = res.as_integer_ratio() if denominator == 1: - if cls.range and not cls.range[0] <= numerator <= cls.range[1]: + if cls.range and not cls.range[0] <= numerator <= cls.range[1]: # pylint: disable=E1136 raise ValueError( - f"{cls.name} must be an integer between {cls.range[0]} and {cls.range[1]}, " - f"but got ", v) + f"{cls.name} must be an integer between " + f"{cls.range[0]} and {cls.range[1]}, but got ", v) # pylint: disable=E1136 return numerator raise ValueError('Invalid value for integer') @@ -834,7 +837,7 @@ def derived_description(datatype: 
"csvw.Datatype") -> dict: return {} @staticmethod - def to_python(v, pattern=None, **kw): # pylint: disable=R1710 + def to_python(v, pattern=None, **_): # pylint: disable=R1710 if pattern and not NumberPattern(pattern).is_valid(v): raise ValueError(f'Invalid value "{v}" for number with pattern "{pattern}"') @@ -1021,7 +1024,7 @@ def to_python(v, schema=None, **_): # pylint: disable=W0237 return res @staticmethod - def to_string(v, **kw): + def to_string(v, **_): return _json.dumps(v) diff --git a/src/csvw/db.py b/src/csvw/db.py index 7f947d2..41c16be 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -33,7 +33,7 @@ import functools import contextlib import collections -from collections.abc import Sequence +from collections.abc import Sequence, Iterator import dataclasses import csvw @@ -436,52 +436,18 @@ def read(self) -> dict[str, list[collections.OrderedDict]]: # FIXME: how much do we want to use DB types? Probably as much as possible! # Thus we need to convert on write **and** read! # - convert, seps, refs = {}, {}, collections.defaultdict(dict) - table = self.tdict[tname] # The TableSpec object. 
- - # Assemble the conversion dictionary: - for col in table.columns: - convert[self.translate(tname, col.name)] = [col.name, identity] - if col.csvw_type in TYPE_MAP: - convert[self.translate(tname, col.name)][1] = \ - TYPE_MAP[col.csvw_type].convert - else: - convert[self.translate(tname, col.name)][1] = \ - DATATYPES[col.csvw_type].to_python - if col.separator: - if col.csvw_type == 'string': - seps[self.translate(tname, col.name)] = col.separator - else: - seps[self.translate(tname, col.name)] = 'json' - + spec = TableReadSpec(self.tdict[tname], tname, self.translate) # Retrieve the many-to-many relations: - for col, at in table.many_to_many.items(): + for col, at in spec.table.many_to_many.items(): for pk, v in self.select_many_to_many(conn, at, col).items(): - refs[pk][self.translate(tname, col)] = v + spec.references[pk][self.translate(tname, col)] = v cols, rows = select(conn, self.translate(tname)) for row in rows: - d = collections.OrderedDict() - for k, v in zip(cols, row): - if k in seps: - if v is None: - d[k] = None - elif not v: - d[k] = [] - elif seps[k] == 'json': - d[k] = json.loads(v) - else: - d[k] = [convert[k][1](v_) for v_ in (v or '').split(seps[k])] - else: - d[k] = convert[k][1](v) if v is not None else None - pk = d[self.translate(tname, table.primary_key[0])] \ - if table.primary_key and len(table.primary_key) == 1 else None - d.update({k: [] for k in table.many_to_many}) - d.update(refs.get(pk, {})) - res[self.translate(tname)].append(d) + res[self.translate(tname)].append(spec.read_row(zip(cols, row))) return res - def association_table_context(self, table, column, fkey): + def association_table_context(self, _, column, fkey): """ Context for association tables is created calling this method. 
@@ -498,12 +464,50 @@ def association_table_context(self, table, column, fkey): return fkey, column def write_from_tg(self, _force=False, _exists_ok=False, _skip_extra=False): + """Write the data from the contained tablegroup to a db.""" return self.write( force=_force, _exists_ok=_exists_ok, _skip_extra=_skip_extra, **self.tg.read()) + def _get_rows(self, t, items, refs, _skip_extra): + rows, keys = [], [] + cols = {c.name: c for c in t.columns} + for i, row in enumerate(items): + pk = row[t.primary_key[0]] if t.primary_key and len(t.primary_key) == 1 else None + values = [] + for k, v in row.items(): + if k in t.many_to_many: + assert pk + atkey = tuple([t.many_to_many[k].name] + # noqa: W504 + [c.name for c in t.many_to_many[k].columns]) + # We distinguish None - meaning NULL - and [] - meaning no items - as + # values of list-valued columns. + refs[atkey] = [ + tuple([pk] + list(self.association_table_context(t, k, vv))) + for vv in (v or [])] + else: + if k not in cols: + if _skip_extra: + continue + raise ValueError(f'unspecified column {k} found in data') + col = cols[k] + if isinstance(v, list): + # Note: This assumes list-valued columns are of datatype string! + if col.csvw_type == 'string': + v = (col.separator or ';').join( + col.db_type.convert(vv) or '' for vv in v) + else: + v = json.dumps(v) + else: + v = col.db_type.convert(v) if v is not None else None + if i == 0: + keys.append(col.name) + values.append(v) + rows.append(tuple(values)) + return rows, keys + def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): """ Creates a db file with the core schema. 
@@ -513,8 +517,7 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): if self.fname and self.fname.exists(): if not force: raise ValueError('db file already exists, use force=True to overwrite') - else: - self.fname.unlink() + self.fname.unlink() with self.connection() as db: for table in self.tables: @@ -527,46 +530,56 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): for t in self.tables: if t.name not in items: continue - rows, keys = [], [] - cols = {c.name: c for c in t.columns} - for i, row in enumerate(items[t.name]): - pk = row[t.primary_key[0]] \ - if t.primary_key and len(t.primary_key) == 1 else None - values = [] - for k, v in row.items(): - if k in t.many_to_many: - assert pk - at = t.many_to_many[k] - atkey = tuple([at.name] + [c.name for c in at.columns]) - # We distinguish None - meaning NULL - and [] - meaning no items - as - # values of list-valued columns. - for vv in (v or []): - fkey, context = self.association_table_context(t, k, vv) - refs[atkey].append((pk, fkey, context)) - else: - if k not in cols: - if _skip_extra: - continue - else: - raise ValueError( - 'unspecified column {0} found in data'.format(k)) - col = cols[k] - if isinstance(v, list): - # Note: This assumes list-valued columns are of datatype string! 
- if col.csvw_type == 'string': - v = (col.separator or ';').join( - col.db_type.convert(vv) or '' for vv in v) - else: - v = json.dumps(v) - else: - v = col.db_type.convert(v) if v is not None else None - if i == 0: - keys.append(col.name) - values.append(v) - rows.append(tuple(values)) + rows, keys = self._get_rows(t, items[t.name], refs, _skip_extra) insert(db, self.translate, t.name, keys, *rows) for atkey, rows in refs.items(): insert(db, self.translate, atkey[0], atkey[1:], *rows) db.commit() + + +@dataclasses.dataclass +class TableReadSpec: + """Bundles data informing the reading of table rows.""" + table: TableSpec + name: str + translate: SchemaTranslator + converters: dict[str, tuple[str, Callable]] = dataclasses.field(default_factory=dict) + separators: dict[str, str] = dataclasses.field(default_factory=dict) + references: dict = dataclasses.field(default_factory=lambda: collections.defaultdict(dict)) + + def __post_init__(self): + # Assemble the conversion dictionary: + for col in self.table.columns: + if col.csvw_type in TYPE_MAP: + conv = TYPE_MAP[col.csvw_type].convert + else: + conv = DATATYPES[col.csvw_type].to_python + self.converters[self.translate(self.name, col.name)] = (col.name, conv) + if col.separator: + if col.csvw_type == 'string': + self.separators[self.translate(self.name, col.name)] = col.separator + else: + self.separators[self.translate(self.name, col.name)] = 'json' + + def read_row(self, row: Iterator[tuple[str, Any]]) -> collections.OrderedDict[str, Any]: + """Read a table according to spec.""" + d = collections.OrderedDict() + for k, v in row: + if k in self.separators: + if v is None: + d[k] = None + elif not v: + d[k] = [] + elif self.separators[k] == 'json': + d[k] = json.loads(v) + else: + d[k] = [self.converters[k][1](v_) for v_ in (v or '').split(self.separators[k])] + else: + d[k] = self.converters[k][1](v) if v is not None else None + pk = d[self.translate(self.name, self.table.primary_key[0])] \ + if 
self.table.primary_key and len(self.table.primary_key) == 1 else None + d.update({k: [] for k in self.table.many_to_many}) + d.update(self.references.get(pk, {})) + return d diff --git a/src/csvw/jsonld.py b/src/csvw/jsonld.py index 8087b4e..68467af 100644 --- a/src/csvw/jsonld.py +++ b/src/csvw/jsonld.py @@ -1,11 +1,15 @@ +""" +Functionality to transform CSVW row values to RDF. +""" import re import json import math -import typing +from typing import TYPE_CHECKING, Any, Union import decimal import pathlib import datetime import collections +from collections.abc import Iterable import dataclasses from rdflib import Graph, URIRef, Literal @@ -14,10 +18,13 @@ from .utils import is_url +if TYPE_CHECKING: + from .metadata import Table, Column + __all__ = ['group_triples', 'to_json', 'Triple', 'format_value'] -def format_value(value, col): +def format_value(value: Any, col: 'Column') -> str: # pylint: disable=R0911 """ Format values as JSON-LD literals. """ @@ -29,24 +36,24 @@ def format_value(value, col): res = re.sub('T[0-9.:]+', '', res) if isinstance(value, (datetime.datetime, datetime.time)): stamp, _, milliseconds = res.partition('.') - return '{}.{}'.format(stamp, milliseconds.rstrip('0')) if milliseconds \ + return f'{stamp}.{milliseconds.rstrip('0')}' if milliseconds \ else stamp.replace('+00:00', 'Z') return res # pragma: no cover if isinstance(value, datetime.timedelta): return col.datatype.formatted(value) if isinstance(value, Duration): return col.datatype.formatted(value) - if isinstance(value, decimal.Decimal): - value = float(value) if isinstance(value, URIReference): return value.unsplit() if isinstance(value, bytes): return col.datatype.formatted(value) if isinstance(value, pathlib.Path): return str(value) + if isinstance(value, decimal.Decimal): + value = float(value) if isinstance(value, float): return 'NaN' if math.isnan(value) else ( - '{}INF'.format('-' if value < 0 else '') if math.isinf(value) else value) + f"{'-' if value < 0 else ''}INF" if 
math.isinf(value) else value) return value @@ -59,36 +66,46 @@ class Triple: property: str value: str - def as_rdflib_triple(self): + def as_rdflib_triple(self) -> tuple[URIRef, URIRef, Union[URIRef, Literal]]: + """The triple suitable for inclusion in an rdflib.Graph.""" return ( URIRef(self.about), URIRef(self.property), URIRef(self.value) if is_url(self.value) else Literal(self.value)) @classmethod - def from_col(cls, table, col, row, prop, val, rownum): + def from_col( # pylint: disable=R0913,R0917 + cls, + table: 'Table', + col: 'Column', + row: collections.OrderedDict[str, Any], + prop: str, + val: Any, + rownum: int, + ) -> 'Triple': """ - + Instantiate a triple from the data (and metadata) of a column value. """ _name = col.header if col else None - propertyUrl = col.propertyUrl if col else table.inherit('propertyUrl') + propertyUrl = col.propertyUrl if col \ + else table.inherit('propertyUrl') # pylint: disable=C0103 if propertyUrl: prop = table.expand(propertyUrl, row, _row=rownum, _name=_name, qname=True) is_type = prop == 'rdf:type' - valueUrl = col.valueUrl if col else table.inherit('valueUrl') + valueUrl = col.valueUrl if col else table.inherit('valueUrl') # pylint: disable=C0103 if valueUrl: val = table.expand(valueUrl, row, _row=rownum, _name=_name, qname=is_type) val = format_value(val, col) s = None - aboutUrl = col.aboutUrl if col else None + aboutUrl = col.aboutUrl if col else None # pylint: disable=invalid-name if aboutUrl: s = table.expand(aboutUrl, row, _row=rownum, _name=_name) or s return cls(about=s, property=prop, value=val) -def frame(data: list) -> list: +def frame(data: list[dict]) -> list: """ Inline referenced items to force a deterministic graph layout. @@ -131,13 +148,11 @@ def to_json(obj, flatten_list=False): return obj -def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]: - """ - Group and frame triples into a `list` of JSON objects. 
- """ +def _merged_triples(triples: Iterable[Triple]) -> list[Triple]: merged = [] for triple in triples: if isinstance(triple.value, list): + # We check, whether a list-valued triple for the same property is already present. for t in merged: if t.property == triple.property and isinstance(t.value, list): t.value.extend(triple.value) @@ -146,25 +161,35 @@ def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]: merged.append(triple) else: merged.append(triple) + return merged - grouped = collections.OrderedDict() - triples = [] - # First pass: get top-level properties. - for triple in merged: + +def _extract_grouped_triples(triples) -> tuple[collections.OrderedDict[str, Triple], list[Triple]]: + """Return triples grouped by property and purge these from `triples`.""" + grouped, rem = collections.OrderedDict(), [] + for triple in triples: if triple.about is None and triple.property == '@id': grouped[triple.property] = triple.value - else: - if not triple.about: - # For test48 - if triple.property in grouped: - if not isinstance(grouped[triple.property], list): - grouped[triple.property] = [grouped[triple.property]] - grouped[triple.property].append(triple.value) - else: - grouped[triple.property] = triple.value + continue + if not triple.about: + # For test48 + if triple.property in grouped: + if not isinstance(grouped[triple.property], list): + grouped[triple.property] = [grouped[triple.property]] + grouped[triple.property].append(triple.value) else: - triples.append(triple) - if not triples: + grouped[triple.property] = triple.value + continue + rem.append(triple) + return grouped, rem + + +def group_triples(triples: Iterable[Triple]) -> list[dict]: + """ + Group and frame triples into a `list` of JSON objects. + """ + grouped, triples = _extract_grouped_triples(_merged_triples(triples)) + if not triples: # All grouped. 
return [grouped] g = Graph() @@ -174,6 +199,7 @@ def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]: for prop, val in grouped.items(): if prop != '@id': g.add(Triple(about=grouped['@id'], property=prop, value=val).as_rdflib_triple()) + res = g.serialize(format='json-ld') # Frame and simplify the resulting objects, augment with list index: res = [(i, to_json(v, flatten_list=True)) for i, v in enumerate(frame(json.loads(res)))] diff --git a/src/csvw/utils.py b/src/csvw/utils.py index 83d97fb..3228e14 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -44,7 +44,7 @@ def json_open(filename, mode='r', encoding='utf-8'): def get_json(fname) -> Union[list, dict]: fname = str(fname) if is_url(fname): - return requests.get(fname).json(object_pairs_hook=collections.OrderedDict) + return requests.get(fname, timeout=10).json(object_pairs_hook=collections.OrderedDict) with json_open(fname) as f: return json.load(f, object_pairs_hook=collections.OrderedDict) @@ -158,14 +158,13 @@ def htmlify(obj, key=None): For inclusion in tables we must use HTML for lists. """ if isinstance(obj, list): - return '
    {}
'.format( - ''.join('
  • {}
  • '.format(htmlify(item, key=key)) for item in obj)) + lis = ''.join(f'
  • {htmlify(item, key=key)}
  • ' for item in obj) + return f'
      {lis}
    ' if isinstance(obj, dict): items = [] for k, v in obj.items(): - items.append('
    {}
    {}
    '.format( - qname2link(k, html=True), html.escape(str(v)))) - return '
    {}
    '.format(''.join(items)) + items.append(f'
    {qname2link(k, html=True)}
    {html.escape(str(v))}
    ') + return f"
    {''.join(items)}
    " return str(obj) def properties(props): @@ -193,15 +192,15 @@ def colrow(col, fks, pk): if col.datatype.format: if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): dt += '
    Valid choices:
    ' - dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|')) + dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|')) elif col.datatype.base == 'string': - dt += '
    Regex: `{}`'.format(col.datatype.format) + dt += f'
    Regex: `{col.datatype.format}`' if col.datatype.minimum: - dt += '
    ≥ {}'.format(col.datatype.minimum) + dt += f'
    ≥ {col.datatype.minimum}' if col.datatype.maximum: - dt += '
    ≤ {}'.format(col.datatype.maximum) + dt += f'
    ≤ {col.datatype.maximum}' if col.separator: - dt = 'list of {} (separated by `{}`)'.format(dt, col.separator) + dt = f'list of {dt} (separated by `{col.separator}`)' desc = col.common_props.get('dc:description', '').replace('\n', ' ') if pk and col.name in pk: From f1a1ace62aa0ff0e790422b638042beaa5473c02 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Thu, 5 Mar 2026 14:42:56 +0100 Subject: [PATCH 09/17] more linting --- src/csvw/__init__.py | 4 +- src/csvw/__main__.py | 88 ++++++++++++++++++------------- src/csvw/_compat.py | 3 ++ src/csvw/datatypes.py | 53 +++++++++++-------- src/csvw/db.py | 120 ++++++++++++++++++++++++------------------ src/csvw/jsonld.py | 2 +- tests/test_db.py | 19 +++++++ 7 files changed, 174 insertions(+), 115 deletions(-) diff --git a/src/csvw/__init__.py b/src/csvw/__init__.py index e888301..96c0298 100644 --- a/src/csvw/__init__.py +++ b/src/csvw/__init__.py @@ -1,4 +1,6 @@ -# csvw - https://w3c.github.io/csvw/primer/ +""" +csvw - https://w3c.github.io/csvw/primer/ +""" from .metadata import ( TableGroup, Table, Column, ForeignKey, Link, NaturalLanguage, Datatype, URITemplate, CSVW, diff --git a/src/csvw/__main__.py b/src/csvw/__main__.py index 3f599f8..048554a 100644 --- a/src/csvw/__main__.py +++ b/src/csvw/__main__.py @@ -1,3 +1,6 @@ +""" +CLI for the csvw package. 
+""" import sys import json import shutil @@ -13,6 +16,7 @@ def parsed_args(desc, args, *argspecs): + """Add custom arguments to the parser and parse.""" if args is None: # pragma: no cover parser = argparse.ArgumentParser(description=desc) for kw, kwargs in argspecs: @@ -21,23 +25,25 @@ def parsed_args(desc, args, *argspecs): return args -def exit(ret, test=False): +def exit(ret, test=False): # pylint: disable=redefined-builtin + """We don't want to exit the test suite""" if test: return ret sys.exit(ret) # pragma: no cover def csvwdescribe(args=None, test=False): + """Describe a (set of) CSV file(s) with basic CSVW metadata.""" frictionless = shutil.which('frictionless') if not frictionless: # pragma: no cover raise ValueError('The frictionless command must be installed for this functionality!\n' 'Run `pip install frictionless` and try again.') args = parsed_args( - "Describe a (set of) CSV file(s) with basic CSVW metadata.", + csvwdescribe.__doc__, args, - (['--delimiter'], dict(default=None)), - (['csv'], dict(nargs='+', help="CSV files to describe as CSVW TableGroup")), + (['--delimiter'], {'default': None}), + (['csv'], {'nargs': '+', 'help': "CSV files to describe as CSVW TableGroup"}), ) fargs = ['describe', '--json'] if args.delimiter: @@ -53,19 +59,20 @@ def csvwdescribe(args=None, test=False): dp = json.loads(subprocess.check_output([frictionless] + fargs + args.csv)) if onefile: - dp = dict(resources=[dp], profile='data-package') + dp = {'resources': [dp], 'profile': 'data-package'} tg = TableGroup.from_frictionless_datapackage(dp) print(json.dumps(tg.asdict(), indent=4)) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 def csvwvalidate(args=None, test=False): + """Validate a (set of) CSV file(s) described by CSVW metadata.""" args = parsed_args( - "Validate a (set of) CSV file(s) described by CSVW metadata.", + csvwvalidate.__doc__, args, - (['url'], dict(help='URL or local path to CSV or JSON metadata file.')), - (['-v', 
'--verbose'], dict(action='store_true', default=False)), + (['url'], {'help': 'URL or local path to CSV or JSON metadata file.'}), + (['-v', '--verbose'], {'action': 'store_true', 'default': False}), ) ret = 0 try: @@ -83,15 +90,16 @@ def csvwvalidate(args=None, test=False): print(colored('FAIL', 'red', attrs=['bold'])) if args.verbose: print(colored(str(e), 'blue')) - return exit(ret, test=test) + return exit(ret, test=test) # pylint: disable=R1722 def csvw2datasette(args=None, test=False): + """Convert CSVW to data for datasette (https://datasette.io/).""" args = parsed_args( - "Convert CSVW to data for datasette (https://datasette.io/).", + csvw2datasette.__doc__, args, - (['url'], dict(help='URL or local path to CSV or JSON metadata file.')), - (['-o', '--outdir'], dict(type=pathlib.Path, default=pathlib.Path('.'))), + (['url'], {'help': 'URL or local path to CSV or JSON metadata file.'}), + (['-o', '--outdir'], {'type': pathlib.Path, 'default': pathlib.Path('.')}), ) dbname, mdname = 'datasette.db', 'datasette-metadata.json' csvw = CSVW(args.url) @@ -99,64 +107,68 @@ def csvw2datasette(args=None, test=False): db.write_from_tg() md = {} for k in ['title', 'description', 'license']: - if 'dc:{}'.format(k) in csvw.common_props: - md[k] = csvw.common_props['dc:{}'.format(k)] - # FIXME: flesh out, see https://docs.datasette.io/en/stable/metadata.html + if f'dc:{k}' in csvw.common_props: + md[k] = csvw.common_props[f'dc:{k}'] args.outdir.joinpath(mdname).write_text(json.dumps(md, indent=4)) - print("""Run - datasette {} --metadata {} -and open your browser at - http://localhost:8001/ -to browse the data. 
-""".format(args.outdir / dbname, args.outdir / mdname)) - return exit(0, test=test) + for line in [ + "Run", + f" datasette {args.outdir / dbname} --metadata {args.outdir / mdname}", + "and open your browser at", + " http://localhost:8001/", + "to browse the data.", + ]: + print(line) + return exit(0, test=test) # pylint: disable=R1722 def csvw2json(args=None, test=False): + """Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/""" args = parsed_args( - "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/", + csvw2json.__doc__, args, - (['url'], dict(help='URL or local path to CSV or JSON metadata file.')), + (['url'], {'help': 'URL or local path to CSV or JSON metadata file.'}), ) csvw = CSVW(args.url) print(json.dumps(csvw.to_json(), indent=4)) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 def csvw2sqlite(args=None, test=False): # pragma: no cover + """Convert CSVW to SQLite""" args = parsed_args( - "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/", + csvw2sqlite.__doc__, args, ( ['url'], - dict(help='URL or local path to CSVW metadata file describing a TableGroup.\n\n' - 'Note that not all valid CSVW datasets can be converted to SQLite. One ' - 'limitation is that all tables which are referenced by foreign keys must ' - 'have a primary key.')), + {'help': 'URL or local path to CSVW metadata file describing a TableGroup.\n\n' + 'Note that not all valid CSVW datasets can be converted to SQLite. 
One ' + 'limitation is that all tables which are referenced by foreign keys must ' + 'have a primary key.'}), ( ['output'], - dict(help='Path for the generated SQLite database file.')), + {'help': 'Path for the generated SQLite database file.'}), ) tg = TableGroup.from_file(args.url) db = Database(tg, args.output) db.write_from_tg(_force=True) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 def csvw2markdown(args=None, test=False): + """Create a Markdown document containing the CSVW metadata in human readable form.""" args = parsed_args( "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/", args, ( ['url'], - dict(help='URL or local path to CSVW metadata file describing a TableGroup.\n\n' - 'Note that not all valid CSVW datasets can be converted to SQLite. One ' - 'limitation is that all tables which are referenced by foreign keys must ' - 'have a primary key.')), + {'help': 'URL or local path to CSVW metadata file describing a TableGroup.\n\n' + 'Note that not all valid CSVW datasets can be converted to SQLite. One ' + 'limitation is that all tables which are referenced by foreign keys must ' + 'have a primary key.'}), ) tg = TableGroup.from_file(args.url) print(metadata2markdown(tg, link_files=True)) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 if __name__ == '__main__': # pragma: no cover diff --git a/src/csvw/_compat.py b/src/csvw/_compat.py index 83cbebd..0386fff 100644 --- a/src/csvw/_compat.py +++ b/src/csvw/_compat.py @@ -1,3 +1,6 @@ +""" +Functionality to address python compatibility issues. 
+""" import re import sys import datetime diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py index e1daf4e..ff43d7a 100644 --- a/src/csvw/datatypes.py +++ b/src/csvw/datatypes.py @@ -14,7 +14,7 @@ import json as _json import math import base64 -from typing import Optional, TYPE_CHECKING, Any +from typing import Optional, TYPE_CHECKING, Any, Callable import decimal as _decimal import binascii import datetime @@ -103,13 +103,14 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C # We wrap a regex specified as `format` property into a group and add `$` to # make sure the whole string is matched when validating. try: - return {'regex': re.compile(r'({})$'.format(datatype.format))} + return { + 'regex': re.compile(r'({})$'.format(datatype.format))} # pylint: disable=C0209 except re.error: warnings.warn('Invalid regex pattern as datatype format') return {} @staticmethod - def to_python(v, regex=None): # pylint: disable=C0116 + def to_python(v, regex=None, **_): # pylint: disable=C0116 if regex and not regex.match(v): string.value_error(v) return v @@ -134,7 +135,7 @@ class anyURI(string): # pylint: disable=invalid-name name = 'anyURI' @staticmethod - def to_python(v, regex=None): # pylint: disable=C0116 + def to_python(v, regex=None, **_): # pylint: disable=C0116 res = string.to_python(v, regex=regex) return rfc3986.URIReference.from_string(res.encode('utf-8')) @@ -169,7 +170,7 @@ class NMTOKEN(string): # pylint: disable=invalid-name name = "NMTOKEN" @staticmethod - def to_python(v, regex=None): # pylint: disable=C0116 + def to_python(v, regex=None, **_): # pylint: disable=C0116 v = string.to_python(v, regex=regex) if not re.fullmatch(r'[\w.:-]*', v): NMTOKEN.value_error(v) @@ -271,24 +272,25 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C return {'true': true, 'false': false} @staticmethod - def to_python(s, true=('true', '1'), false=('false', '0')): # pylint: disable=C0116 - if isinstance(s, 
bool) or s is None: - return s - if s in true: + def to_python(v, true=('true', '1'), false=('false', '0'), **_): # pylint: disable=C0116 + if isinstance(v, bool) or v is None: + return v + if v in true: return True - if s in false: + if v in false: return False - raise boolean.value_error(s) + raise boolean.value_error(v) @staticmethod - def to_string(v, true=('true', '1'), false=('false', '0')): # pylint: disable=C0116 + def to_string(v, true=('true', '1'), false=('false', '0'), **_): # pylint: disable=C0116 return (true if v else false)[0] -def with_tz(v, func, args, kw): +def with_tz(v, func: Callable[..., datetime.datetime], args: tuple, kw: dict) -> datetime.datetime: + """Handle timezone when parsing a datatime using func.""" tz_pattern = re.compile('(Z|[+-][0-2][0-9]:[0-5][0-9])$') tz = tz_pattern.search(v) - if tz: + if tz: # We split off the timezone and handle it separately. v = v[:tz.start()] tz = tz.groups()[0] res = func(v, *args, **kw) @@ -338,7 +340,7 @@ def _parse(v, cls, regex, tz_marker=None): return res @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 + def to_python(v, regex=None, tz_marker=None, pattern=None, **_): # pylint: disable=C0116 if pattern and regex: match = regex.match(v) if not match: @@ -349,7 +351,7 @@ def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: return dateTime._parse(v, datetime.datetime, regex, tz_marker=tz_marker) @staticmethod - def to_string(v, regex=None, pattern=None, **_): # pylint: disable=C0116 + def to_string(v, pattern=None, **_): # pylint: disable=C0116 if pattern: return babel.dates.format_datetime(v, tzinfo=v.tzinfo, format=pattern) return v.isoformat() @@ -380,12 +382,18 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C return dt_format_and_regex('yyyy-MM-dd') @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 + def to_python(v, 
# pylint: disable=C0116 + regex=None, tz_marker=None, pattern=None, fmt=None, **_): return with_tz( v.strip(), dateTime.to_python, [], {'regex': regex, 'fmt': fmt, 'pattern': pattern}) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 + def to_string(v, # pylint: disable=C0116,W0221 + pattern=None, + regex=None, + tz_marker=None, + fmt=None, + **_): if pattern: return format_date(v, format=pattern, locale='en') return dateTime.to_string(v, regex=regex, fmt=fmt, tz_marker=tz_marker, pattern=pattern) @@ -420,14 +428,14 @@ def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C return dt_format_and_regex(datatype.format or 'HH:mm:ss', no_date=True) @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): # pylint: disable=C0116 + def to_python(v, regex=None, tz_marker=None, pattern=None, **_): # pylint: disable=C0116 if pattern and 'x' in pattern.lower(): return dateutil.parser.parse(f'{datetime.date.today().isoformat()}T{v}') assert regex is not None return with_tz(v, dateTime._parse, [datetime.datetime, regex], {'tz_marker': tz_marker}) @staticmethod - def to_string(v, regex=None, pattern=None, **_): # pylint: disable=C0116 + def to_string(v, pattern=None, **_): # pylint: disable=C0116 return babel.dates.format_time(v, tzinfo=v.tzinfo, format=pattern) @@ -886,7 +894,7 @@ class normalizedString(string): # pylint: disable=invalid-name name = 'normalizedString' @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None, **_): if v: for c in '\r\n\t': v = v.replace(c, ' ') @@ -1011,7 +1019,6 @@ def derived_description(datatype: "csvw.Datatype") -> dict: pass return {} - # FIXME: ignored **kw? # why not just to_python = staticmethod(_json.loads)? 
@staticmethod def to_python(v, schema=None, **_): # pylint: disable=W0237 @@ -1245,7 +1252,7 @@ def _get_significant(groups): significant, leadingzero, skip = [], False, True for c in ''.join(groups): - if c in ['+', '-', '%', # fixme: permil + if c in ['+', '-', '%', # fixme: permil # pylint: disable=fixme ]: continue if c == '0' and skip: diff --git a/src/csvw/db.py b/src/csvw/db.py index 41c16be..c4332f9 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -38,16 +38,17 @@ import csvw from csvw.datatypes import DATATYPES -from csvw.metadata import TableGroup +from csvw.metadata import TableGroup, Datatype from .utils import optional -def identity(s): +def identity(s): # pylint: disable=C0116 return s @dataclasses.dataclass class DBType: + """A DB datatype together with read/write converters.""" name: str convert: Callable[[Any], Any] = identity read: Callable[[Any], Any] = identity @@ -62,17 +63,18 @@ class DBType: } -class SchemaTranslator(Protocol): +class SchemaTranslator(Protocol): # pylint: disable=R0903,C0115 def __call__(self, table: str, column: Optional[str] = None) -> str: ... # pragma: no cover -class ColumnTranslator(Protocol): +class ColumnTranslator(Protocol): # pylint: disable=R0903,C0115 def __call__(self, column: str) -> str: ... # pragma: no cover def quoted(*names: str) -> str: + """Returns a comma-separated list of quoted schema object names.""" return ','.join(f'`{name}`' for name in names) @@ -94,13 +96,12 @@ def insert(db: sqlite3.Connection, a time, allowing for more focused debugging output in case of errors. """ if rows: - sql = "INSERT INTO {0} ({1}) VALUES ({2})".format( - quoted(translate(table)), - quoted(*[translate(table, k) for k in keys]), - ','.join(['?' for _ in keys])) + cols = quoted(*[translate(table, k) for k in keys]) + vals = ','.join(['?' for _ in keys]) + sql = f"INSERT INTO {quoted(translate(table))} ({cols}) VALUES ({vals})" try: db.executemany(sql, rows) - except: # noqa: E722 - this is purely for debugging. 
+ except: # noqa: E722 - this is purely for debugging. pylint: disable=bare-except if not single: for row in rows: insert(db, translate, table, keys, row, single=True) @@ -111,6 +112,7 @@ def insert(db: sqlite3.Connection, def select(db: sqlite3.Connection, table: str) -> tuple[list[str], Sequence]: + """Shortcut to construct and execute simple SELECT statements.""" cu = db.execute(f"SELECT * FROM {quoted(table)}") cols = [d[0] for d in cu.description] return cols, list(cu.fetchall()) @@ -126,7 +128,7 @@ class ColSpec: separator: str = None db_type: DBType = None required: bool = False - csvw: str = None + csvw: Datatype = None def __post_init__(self): self.csvw_type = self.csvw_type or 'string' @@ -146,7 +148,7 @@ def check(self, translate: ColumnTranslator) -> Optional[str]: :return: A string suitable as argument of an SQL CHECK constraint. """ if not self.csvw: - return + return None c, cname = self.csvw, translate(self.name) constraints = [] if (c.minimum is not None) or (c.maximum is not None): @@ -156,24 +158,25 @@ def check(self, translate: ColumnTranslator) -> Optional[str]: }.get(self.csvw_type) if c.minimum is not None: if func: - constraints.append("{2}(`{0}`) >= {2}('{1}')".format(cname, c.minimum, func)) + constraints.append(f"{func}(`{cname}`) >= {func}('{c.minimum}')") else: - constraints.append('`{0}` >= {1}'.format(cname, c.minimum)) + constraints.append(f'`{cname}` >= {c.minimum}') if c.maximum is not None: if func: - constraints.append("{2}(`{0}`) <= {2}('{1}')".format(cname, c.maximum, func)) + constraints.append(f"{func}(`{cname}`) <= {func}('{c.maximum}')") else: - constraints.append('`{0}` <= {1}'.format(cname, c.maximum)) + constraints.append(f'`{cname}` <= {c.maximum}') elif any(cc is not None for cc in [c.length, c.minLength, c.maxLength]): if c.length: - constraints.append('length(`{0}`) = {1}'.format(cname, c.length)) + constraints.append(f'length(`{cname}`) = {c.length}') if c.minLength: - constraints.append('length(`{0}`) >= 
{1}'.format(cname, c.minLength)) + constraints.append(f'length(`{cname}`) >= {c.minLength}') if c.maxLength: - constraints.append('length(`{0}`) <= {1}'.format(cname, c.maxLength)) + constraints.append(f'length(`{cname}`) <= {c.maxLength}') return ' AND '.join(constraints) def sql(self, translate: ColumnTranslator) -> str: + """Format the column metadata suitable for inclusion in a CREATE TABLE statement.""" _check = self.check(translate) null_constraint = ' NOT NULL' if self.required else '' check_constraint = f' CHECK ({_check})' if _check else '' @@ -222,12 +225,11 @@ def from_table_metadata(cls, if len(fk.columnReference) == 1 and fk.columnReference[0] in list_valued: # List-valued foreign keys are turned into a many-to-many relation! assert len(fk.reference.columnReference) == 1, \ - 'Composite key {0} in table {1} referenced'.format( - fk.reference.columnReference, - fk.reference.resource) + (f'Composite key {fk.reference.columnReference} in table ' + f'{fk.reference.resource} referenced') assert spec.primary_key and len(spec.primary_key) == 1, \ - 'Table {0} referenced by list-valued foreign key must have non-composite ' \ - 'primary key'.format(spec.name) + (f'Table {spec.name} referenced by list-valued foreign key must have ' + f'non-composite primary key') spec.many_to_many[fk.columnReference[0]] = TableSpec.association_table( spec.name, spec.primary_key[0], @@ -261,13 +263,13 @@ def association_table(cls, atable, apk, btable, bpk) -> 'TableSpec': a column `context`, which stores the name of the foreign key column from which a row in the assocation table was created. 
""" - afk = ColSpec('{0}_{1}'.format(atable, apk)) - bfk = ColSpec('{0}_{1}'.format(btable, bpk)) + afk = ColSpec(f'{atable}_{apk}') + bfk = ColSpec(f'{btable}_{bpk}') if afk.name == bfk.name: afk.name += '_1' bfk.name += '_2' return cls( - name='{0}_{1}'.format(atable, btable), + name=f'{atable}_{btable}', columns=[afk, bfk, ColSpec('context')], foreign_keys=[ ([afk.name], atable, [apk]), @@ -281,17 +283,21 @@ def sql(self, translate: SchemaTranslator) -> str: :return: The SQL statement to create the table. """ col_translate = functools.partial(translate, self.name) + # Assemble the column specifications: clauses = [col.sql(col_translate) for col in self.columns] + # Then add the constraints: if self.primary_key: - clauses.append('PRIMARY KEY({0})'.format(quoted( - *[col_translate(c) for c in self.primary_key]))) + qcols = quoted(*[col_translate(c) for c in self.primary_key]) + clauses.append(f'PRIMARY KEY({qcols})') for fk, ref, refcols in self.foreign_keys: - clauses.append('FOREIGN KEY({0}) REFERENCES {1}({2}) ON DELETE CASCADE'.format( - quoted(*[col_translate(c) for c in fk]), - quoted(translate(ref)), - quoted(*[translate(ref, c) for c in refcols]))) - return "CREATE TABLE IF NOT EXISTS `{0}` (\n {1}\n)".format( - translate(self.name), ',\n '.join(clauses)) + fkcols = quoted(*[col_translate(c) for c in fk]) + rtable = quoted(translate(ref)) + pkcols = quoted(*[translate(ref, c) for c in refcols]) + clauses.append(f'FOREIGN KEY({fkcols}) REFERENCES {rtable}({pkcols}) ON DELETE CASCADE') + + clauses = ',\n '.join(clauses) + return '\n'.join([ + f"CREATE TABLE IF NOT EXISTS `{translate(self.name)}` (", f"{clauses}", ")"]) def schema(tg: csvw.TableGroup, @@ -307,7 +313,7 @@ def schema(tg: csvw.TableGroup, :return: A pair (tables, reference_tables). 
""" tables = {} - for tname, table in tg.tabledict.items(): + for table in tg.tabledict.values(): t = TableSpec.from_table_metadata( table, drop_self_referential_fks=drop_self_referential_fks) tables[t.name] = t @@ -333,7 +339,7 @@ def schema(tg: csvw.TableGroup, return list(ordered.values()) -class Database(object): +class Database: """ Represents a SQLite database associated with a :class:`csvw.TableGroup` instance. @@ -364,13 +370,14 @@ def __init__( self.init_schema(tg, drop_self_referential_fks=drop_self_referential_fks) self._connection = None # For in-memory dbs we need to keep the connection! - def init_schema(self, tg, drop_self_referential_fks=True): + def init_schema(self, tg: TableGroup, drop_self_referential_fks: bool = True): + """Inititialize the db schema, possibly ignoring self-referential foreign keys.""" self.tg = tg self.tables = schema( self.tg, drop_self_referential_fks=drop_self_referential_fks) if self.tg else [] @property - def tdict(self) -> dict[str, TableSpec]: + def tdict(self) -> dict[str, TableSpec]: # pylint: disable=C0116 return {t.name: t for t in self.tables} @staticmethod @@ -387,24 +394,30 @@ def name_translator(table: str, column: Optional[str] = None) -> str: return column or table def connection(self) -> Union[sqlite3.Connection, contextlib.closing]: + """DB connection to be used as context manager.""" if self.fname: return contextlib.closing(sqlite3.connect(str(self.fname))) if not self._connection: self._connection = sqlite3.connect(':memory:') return self._connection - def select_many_to_many(self, db, table, context) -> dict: + def _qt(self, tname: str, cname: Optional[str] = None) -> str: + """Translate and then quote a db schema object.""" + if cname: + return quoted(self.translate(tname, cname)) + return quoted(self.translate(tname)) + + def select_many_to_many(self, db, table, context) -> dict[str, Union[tuple[str, str], str]]: + """Select data from an association table, grouped by first foreign key.""" if context is 
not None: - context_sql = "WHERE context = '{0}'".format(context) + context_sql = f"WHERE context = '{context}'" else: context_sql = '' - sql = """\ -SELECT {0}, group_concat({1}, ' '), group_concat(COALESCE(context, ''), '||') -FROM {2} {3} GROUP BY {0}""".format( - quoted(self.translate(table.name, table.columns[0].name)), - quoted(self.translate(table.name, table.columns[1].name)), - quoted(self.translate(table.name)), - context_sql) + qt = functools.partial(self._qt, table.name) + sql = (f"SELECT {qt(table.columns[0].name)}, " + f" group_concat({qt(table.columns[1].name)}, ' '), " + f" group_concat(COALESCE(context, ''), '||') " + f"FROM {qt()} {context_sql} GROUP BY {qt(table.columns[0].name)}") cu = db.execute(sql) return { r[0]: [(k, v) if context is None else k @@ -419,8 +432,10 @@ def separator(self, tname: str, cname: str) -> Optional[str]: for col in self.tdict[name].columns: if self.translate(name, col.name) == cname: return col.separator + return None - def split_value(self, tname, cname, value) -> Union[list[str], str, None]: + def split_value(self, tname: str, cname: str, value) -> Union[list[str], str, None]: + """Split a value if a separator is defined for the column.""" sep = self.separator(tname, cname) return (value or '').split(sep) if sep else value @@ -433,8 +448,8 @@ def read(self) -> dict[str, list[collections.OrderedDict]]: with self.connection() as conn: for tname in self.tg.tabledict: # - # FIXME: how much do we want to use DB types? Probably as much as possible! - # Thus we need to convert on write **and** read! + # How much do we want to use DB types? Probably as much as possible! + # Thus we'd need to convert on write **and** read! 
# spec = TableReadSpec(self.tdict[tname], tname, self.translate) # Retrieve the many-to-many relations: @@ -484,9 +499,9 @@ def _get_rows(self, t, items, refs, _skip_extra): [c.name for c in t.many_to_many[k].columns]) # We distinguish None - meaning NULL - and [] - meaning no items - as # values of list-valued columns. - refs[atkey] = [ + refs[atkey].extend([ tuple([pk] + list(self.association_table_context(t, k, vv))) - for vv in (v or [])] + for vv in (v or [])]) else: if k not in cols: if _skip_extra: @@ -532,6 +547,7 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): continue rows, keys = self._get_rows(t, items[t.name], refs, _skip_extra) insert(db, self.translate, t.name, keys, *rows) + print(refs) for atkey, rows in refs.items(): insert(db, self.translate, atkey[0], atkey[1:], *rows) diff --git a/src/csvw/jsonld.py b/src/csvw/jsonld.py index 68467af..c9ed405 100644 --- a/src/csvw/jsonld.py +++ b/src/csvw/jsonld.py @@ -36,7 +36,7 @@ def format_value(value: Any, col: 'Column') -> str: # pylint: disable=R0911 res = re.sub('T[0-9.:]+', '', res) if isinstance(value, (datetime.datetime, datetime.time)): stamp, _, milliseconds = res.partition('.') - return f'{stamp}.{milliseconds.rstrip('0')}' if milliseconds \ + return f'{stamp}.{milliseconds.rstrip("0")}' if milliseconds \ else stamp.replace('+00:00', 'Z') return res # pragma: no cover if isinstance(value, datetime.timedelta): diff --git a/tests/test_db.py b/tests/test_db.py index d134741..9b68b8f 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -244,6 +244,25 @@ def test_many_to_many(tg_with_foreign_keys): db._connection.close() +def test_many_to_many_2(tg_with_foreign_keys): + db = Database(tg_with_foreign_keys) + db.write( + ref=[ + {'pk': '1', 'ref1': ['y', 'x']}, + {'pk': '2', 'ref1': ['x']}, + ], + data=[{'v': 'x'}, {'v': 'y'}]) + + res = db.read()['ref'][0] + # Associations between the same pair of tables are grouped by foreign key column: + assert res['ref1'] == 
['y', 'x'] + assert res['ref2'] == [] + res = db.read()['ref'][1] + # Associations between the same pair of tables are grouped by foreign key column: + assert res['ref1'] == ['x'] + db._connection.close() + + def test_many_to_many_no_context(tg_with_foreign_keys): class DatabaseWithoutContext(Database): def association_table_context(self, table, column, fkey): From 0a07046f41e50f5233d57d8b1d856ff810a7a80d Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 07:50:09 +0100 Subject: [PATCH 10/17] more linting --- src/csvw/__main__.py | 2 +- src/csvw/db.py | 6 +- src/csvw/dsv.py | 56 +++++++---- src/csvw/dsv_dialects.py | 47 ++++++---- src/csvw/frictionless.py | 138 ++++++++++++++------------- src/csvw/metadata.py | 50 +++++----- src/csvw/metadata_utils.py | 143 +++++++++++++++++++++++++++- src/csvw/utils.py | 185 +++++++------------------------------ tests/test_utils.py | 4 - 9 files changed, 346 insertions(+), 285 deletions(-) diff --git a/src/csvw/__main__.py b/src/csvw/__main__.py index 048554a..db87a8e 100644 --- a/src/csvw/__main__.py +++ b/src/csvw/__main__.py @@ -12,7 +12,7 @@ from csvw import CSVW, TableGroup from csvw.db import Database -from csvw.utils import metadata2markdown +from csvw.metadata_utils import metadata2markdown def parsed_args(desc, args, *argspecs): diff --git a/src/csvw/db.py b/src/csvw/db.py index c4332f9..51e0704 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -39,7 +39,7 @@ import csvw from csvw.datatypes import DATATYPES from csvw.metadata import TableGroup, Datatype -from .utils import optional +from .utils import optcast def identity(s): # pylint: disable=C0116 @@ -57,8 +57,8 @@ class DBType: TYPE_MAP = { 'string': DBType('TEXT'), 'integer': DBType('INTEGER'), - 'boolean': DBType('INTEGER', optional(int), optional(bool)), - 'decimal': DBType('REAL', optional(float), optional(decimal.Decimal)), + 'boolean': DBType('INTEGER', optcast(int), optcast(bool)), + 'decimal': DBType('REAL', optcast(float), 
optcast(decimal.Decimal)), 'hexBinary': DBType('BLOB'), } diff --git a/src/csvw/dsv.py b/src/csvw/dsv.py index fd1cf64..858fc0d 100644 --- a/src/csvw/dsv.py +++ b/src/csvw/dsv.py @@ -35,13 +35,16 @@ PathType = Union[str, pathlib.Path] LinesOrPath = Union[PathType, IO, Iterable[str]] +# Note: The value for restkey is a list of all surplus column values. +DictRowType = collections.OrderedDict[str, Union[str, list[str]]] def normalize_encoding(encoding: str) -> str: + """Normalize the name of the encoding.""" return codecs.lookup(encoding).name -class UnicodeWriter: +class UnicodeWriter: # pylint: disable=too-many-instance-attributes """ Write Unicode data to a csv file. @@ -91,6 +94,7 @@ def _escapedoubled(row): self._escapedoubled = _escapedoubled self._close = False self._rows_written = 0 + self.writer = None def __enter__(self): if isinstance(self.f, (str, pathlib.Path)): @@ -116,11 +120,12 @@ def read(self) -> Optional[bytes]: return self.f.read().encode('utf-8') return None # pragma: no cover - def __exit__(self, type, value, traceback): + def __exit__(self, type_, value, traceback): if self._close: self.f.close() def writerow(self, row: Iterable[Union[str, None]]): + """Write multiple rows.""" self.writer.writerow(self._escapedoubled(row)) self._rows_written += 1 @@ -144,7 +149,7 @@ def writerows(self, rows: Iterable[Union[tuple, list, dict]]): self.writerow(row) -class UnicodeReader: +class UnicodeReader: # pylint: disable=too-many-instance-attributes """ Read Unicode data from a csv file. 
@@ -174,6 +179,8 @@ def __init__( self.encoding = normalize_encoding(kw.pop('encoding', 'utf-8-sig')) self.newline = kw.pop('lineterminator', None) self.dialect = dialect if isinstance(dialect, Dialect) else None + self.lineno = None + self.reader = None if self.dialect: self.encoding = self.dialect.python_encoding self.kw = dialect.as_python_formatting_parameters() @@ -218,7 +225,7 @@ def _next_row(self): row = [ s if isinstance(s, str) else s.decode(self._reader_encoding) for s in next(self.reader)] - self.lineno += sum([list(s).count('\n') for s in row]) + self.lineno += sum(list(s).count('\n') for s in row) return row def __next__(self): @@ -247,7 +254,7 @@ def __iter__(self): return self -class UnicodeReaderWithLineNumber(UnicodeReader): +class UnicodeReaderWithLineNumber(UnicodeReader): # pylint: disable=too-few-public-methods """ A `UnicodeReader` yielding (lineno, row) pairs, where "lineno" is the 1-based number of the the **text line** where the (possibly multi-line) row data starts in the DSV file. @@ -257,7 +264,7 @@ def __next__(self): :return: a pair (1-based line number in the input, row) """ # Retrieve the row, thereby incrementing the line number: - row = super(UnicodeReaderWithLineNumber, self).__next__() + row = super().__next__() return self.lineno + 1, row @@ -284,19 +291,26 @@ class UnicodeDictReader(UnicodeReader): """ - def __init__(self, f, fieldnames=None, restkey=None, restval=None, **kw): + def __init__( + self, + f, + fieldnames: Optional[list[str]] = None, + restkey: Optional[str] = None, + restval: Optional[str] = None, + **kw): self._fieldnames = fieldnames # list of keys for the dict self.restkey = restkey # key to catch long rows self.restval = restval # default value for short rows self.line_num = 0 - super(UnicodeDictReader, self).__init__(f, **kw) + super().__init__(f, **kw) @property - def fieldnames(self): + def fieldnames(self) -> Optional[list[str]]: + """Get the fieldnames, i.e. 
the dictionary keys for the rows.""" if self._fieldnames is None: - try: + try: # Read the first row. self._fieldnames = super().__next__() - except StopIteration: + except StopIteration: # No rows, so no fieldnames is ok. pass self.line_num = self.reader.line_num if self._fieldnames: @@ -304,10 +318,10 @@ def fieldnames(self): warnings.warn('Duplicate column names!') return self._fieldnames - def __next__(self) -> collections.OrderedDict: + def __next__(self) -> DictRowType: if self.line_num == 0: # Used only for its side effect. - self.fieldnames + self.fieldnames # pylint: disable=pointless-statement row = super().__next__() self.line_num = self.reader.line_num @@ -318,7 +332,8 @@ def __next__(self) -> collections.OrderedDict: row = super().__next__() return self.item(row) - def item(self, row) -> collections.OrderedDict[str, str]: + def item(self, row) -> DictRowType: + """Turn a row into a dict.""" d = collections.OrderedDict((k, v) for k, v in zip(self.fieldnames, row)) lf = len(self.fieldnames) lr = len(row) @@ -344,10 +359,14 @@ class NamedTupleReader(UnicodeDictReader): @functools.cached_property def cls(self): + """ + Creates a namedtuple class suitable for the columns of the CSV content. + """ fieldnames = list(map(self._normalize_fieldname, self.fieldnames)) return collections.namedtuple('Row', fieldnames) def item(self, row): + """Create a namedtuple from a row.""" d = UnicodeDictReader.item(self, row) for name in self.fieldnames: d.setdefault(name, None) @@ -372,7 +391,7 @@ def iterrows(lines_or_file: LinesOrPath, """ if namedtuples and dicts: raise ValueError('either namedtuples or dicts can be chosen as output format') - elif namedtuples: + if namedtuples: _reader = NamedTupleReader elif dicts: _reader = UnicodeDictReader @@ -394,7 +413,7 @@ def rewrite(fname: PathType, visitor: Callable[[int, list[str]], Union[None, lis (modified) row or None to filter out the row. :param kw: Keyword parameters are passed through to csv.reader/csv.writer. 
""" - fname = utils.ensure_path(fname) + fname = pathlib.Path(fname) assert fname.is_file() with tempfile.NamedTemporaryFile(delete=False) as fp: tmp = pathlib.Path(fp.name) @@ -409,10 +428,13 @@ def rewrite(fname: PathType, visitor: Callable[[int, list[str]], Union[None, lis def add_rows(fname: PathType, *rows: list[str]): + """ + Add rows to a CSV file. + """ with tempfile.NamedTemporaryFile(delete=False) as fp: tmp = pathlib.Path(fp.name) - fname = utils.ensure_path(fname) + fname = pathlib.Path(fname) with UnicodeWriter(tmp) as writer: if fname.exists(): with UnicodeReader(fname) as reader_: diff --git a/src/csvw/dsv_dialects.py b/src/csvw/dsv_dialects.py index cc0b320..288d8c0 100644 --- a/src/csvw/dsv_dialects.py +++ b/src/csvw/dsv_dialects.py @@ -9,7 +9,7 @@ - ``_ - ``_ """ -import typing +from typing import Callable, Literal import warnings import functools import dataclasses @@ -25,7 +25,7 @@ def convert_encoding(s): - s = utils.converter(str, 'utf-8', s) + s = utils.type_checker(str, 'utf-8', s) try: _ = 'x'.encode(ENCODING_MAP.get(s, s)) return s @@ -54,44 +54,46 @@ class Dialect: skipColumns: int = 0 skipBlankRows: bool = False skipInitialSpace: bool = False - trim: typing.Literal['true', 'false', 'start', 'end'] = 'false' + trim: Literal['true', 'false', 'start', 'end'] = 'false' def __post_init__(self): self.encoding = convert_encoding(self.encoding) - self.line_terminators = utils.converter(list, ['\r\n', '\n'], self.line_terminators) - self.quoteChar = utils.converter(str, '"', self.quoteChar, allow_none=True) - self.doubleQuote = utils.converter(bool, True, self.doubleQuote) - self.skipRows = utils.converter(int, 0, self.skipRows, cond=lambda s: s >= 0) - self.commentPrefix = utils.converter(str, '#', self.commentPrefix, allow_none=True) - self.header = utils.converter(bool, True, self.header) - self.headerRowCount = utils.converter( + self.line_terminators = utils.type_checker(list, ['\r\n', '\n'], self.line_terminators) + self.quoteChar = 
utils.type_checker(str, '"', self.quoteChar, allow_none=True) + self.doubleQuote = utils.type_checker(bool, True, self.doubleQuote) + self.skipRows = utils.type_checker(int, 0, self.skipRows, cond=lambda s: s >= 0) + self.commentPrefix = utils.type_checker(str, '#', self.commentPrefix, allow_none=True) + self.header = utils.type_checker(bool, True, self.header) + self.headerRowCount = utils.type_checker( int, 1, self.headerRowCount, cond=lambda s: s >= 0) - self.delimiter = utils.converter(str, ',', self.delimiter) - self.skipColumns = utils.converter(int, 0, self.skipColumns, cond=lambda s: s >= 0) - self.skipBlankRows = utils.converter(bool, False, self.skipBlankRows) - self.skipInitialSpace = utils.converter(bool, False, self.skipInitialSpace) - self.trim = utils.converter( + self.delimiter = utils.type_checker(str, ',', self.delimiter) + self.skipColumns = utils.type_checker(int, 0, self.skipColumns, cond=lambda s: s >= 0) + self.skipBlankRows = utils.type_checker(bool, False, self.skipBlankRows) + self.skipInitialSpace = utils.type_checker(bool, False, self.skipInitialSpace) + self.trim = utils.type_checker( (str, bool), 'false', str(self.trim).lower() if isinstance(self.trim, bool) else self.trim) assert self.trim in ['true', 'false', 'start', 'end'], 'invalid trim' - def updated(self, **kw): + def updated(self, **kw) -> 'Dialect': + """Update the spec, returning a new updated object.""" res = self.__class__(**dataclasses.asdict(self)) for k, v in kw.items(): setattr(res, k, v) return res @functools.cached_property - def escape_character(self): + def escape_character(self): # pylint: disable=C0116 return None if self.quoteChar is None else ('"' if self.doubleQuote else '\\') @functools.cached_property - def line_terminators(self): + def line_terminators(self) -> list[str]: # pylint: disable=C0116 return [self.lineTerminators] \ if isinstance(self.lineTerminators, str) else self.lineTerminators @functools.cached_property - def trimmer(self): + def 
trimmer(self) -> Callable[[str], str]: + """Map trim spec to a callable to do the trimming.""" return { True: lambda s: s.strip(), 'true': lambda s: s.strip(), @@ -102,13 +104,20 @@ def trimmer(self): }[self.trim] def asdict(self, omit_defaults=True): + """The dialect spec as dict suitable for JSON serialization.""" return dataclass_asdict(self, omit_defaults=omit_defaults) @property def python_encoding(self): + """ + Turn the encoding name into something understood by python. + """ return ENCODING_MAP.get(self.encoding, self.encoding) def as_python_formatting_parameters(self): + """ + Turn the dialect spec into a dict suitable as kwargs for Python's csv implementation. + """ return { 'delimiter': self.delimiter, 'doublequote': self.doubleQuote, diff --git a/src/csvw/frictionless.py b/src/csvw/frictionless.py index 3bd0200..b0bc87d 100644 --- a/src/csvw/frictionless.py +++ b/src/csvw/frictionless.py @@ -10,58 +10,66 @@ """ import json import pathlib +from typing import Any, TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from csvw.metadata import TableGroup + + +def _convert_numeric_datatype(spec): + datatype = {'base': spec['type']} + if spec['type'] == 'string' and spec.get('format'): + datatype['dc:format'] = spec['format'] + if spec['type'] == 'boolean' and spec.get('trueValues') and spec.get('falseValues'): + datatype['format'] = f"{spec['trueValues'][0]}|{spec['falseValues'][0]}" + if spec['type'] in ['number', 'integer']: + if spec.get('bareNumber') is True: # pragma: no cover + raise NotImplementedError( + 'bareNumber is not supported in CSVW. It may be possible to translate to ' + 'a number pattern, though. 
See ' + 'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/' + '#formats-for-numeric-types') + if any(prop in spec for prop in ['decimalChar', 'groupChar']): + datatype['format'] = {} + for p in ['decimalChar', 'groupChar']: + if spec.get(p): + datatype['format'][p] = spec[p] + return datatype + + +def _convert_datatype(spec): # pylint: disable=too-many-return-statements + typemap = { + 'year': 'gYear', + 'yearmonth': 'gYearMonth', + } + if 'type' in spec: + if spec['type'] == 'string' and spec.get('format') == 'binary': + return {'base': 'binary'} + if spec['type'] == 'string' and spec.get('format') == 'uri': + return {'base': 'anyURI'} + if spec['type'] in typemap: + return {'base': typemap[spec['type']]} + if spec['type'] in [ + 'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration', + ]: + return _convert_numeric_datatype(spec) + if spec['type'] in ['object', 'array']: + return {'base': 'json', 'dc:format': 'application/json'} + if spec['type'] == 'geojson': + return {'base': 'json', 'dc:format': 'application/geo+json'} + return {'base': 'string'} -def convert_column_spec(spec): +def convert_column_spec(spec: dict[str, Any]) -> dict[str, Any]: """ https://specs.frictionlessdata.io/table-schema/#field-descriptors :param spec: :return: """ - typemap = { - 'year': 'gYear', - 'yearmonth': 'gYearMonth', - } + res = {'name': spec['name'], 'datatype': _convert_datatype(spec)} titles = [t for t in [spec.get('title')] if t] - - res = {'name': spec['name'], 'datatype': {'base': 'string'}} - if 'type' in spec: - if spec['type'] == 'string' and spec.get('format') == 'binary': - res['datatype']['base'] = 'binary' - elif spec['type'] == 'string' and spec.get('format') == 'uri': - res['datatype']['base'] = 'anyURI' - elif spec['type'] in typemap: - res['datatype']['base'] = typemap[spec['type']] - elif spec['type'] in [ - 'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration', - ]: - res['datatype']['base'] = 
spec['type'] - if spec['type'] == 'string' and spec.get('format'): - res['datatype']['dc:format'] = spec['format'] - if spec['type'] == 'boolean' and spec.get('trueValues') and spec.get('falseValues'): - res['datatype']['format'] = '{}|{}'.format( - spec['trueValues'][0], spec['falseValues'][0]) - if spec['type'] in ['number', 'integer']: - if spec.get('bareNumber') is True: # pragma: no cover - raise NotImplementedError( - 'bareNumber is not supported in CSVW. It may be possible to translate to ' - 'a number pattern, though. See ' - 'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/' - '#formats-for-numeric-types') - if any(prop in spec for prop in ['decimalChar', 'groupChar']): - res['datatype']['format'] = {} - for p in ['decimalChar', 'groupChar']: - if spec.get(p): - res['datatype']['format'][p] = spec[p] - elif spec['type'] in ['object', 'array']: - res['datatype']['base'] = 'json' - res['datatype']['dc:format'] = 'application/json' - elif spec['type'] == 'geojson': - res['datatype']['base'] = 'json' - res['datatype']['dc:format'] = 'application/geo+json' - if titles: res['titles'] = titles if 'description' in spec: @@ -75,24 +83,25 @@ def convert_column_spec(spec): res['datatype'][prop] = constraints[prop] if ('pattern' in constraints) and ('format' not in res['datatype']): res['datatype']['format'] = constraints['pattern'] - # FIXME: we could transform the "enum" constraint for string into + # We could transform the "enum" constraint for string into # a regular expression in the "format" property. return res -def convert_foreignKey(rsc_name, fk, resource_map): +def convert_foreignKey( # pylint: disable=C0103 + rsc_name: str, fk: dict, resource_map: dict) -> dict[str, Any]: """ https://specs.frictionlessdata.io/table-schema/#foreign-keys """ # Rename "fields" to "columnReference" and map resource name to url (resolving self-referential # foreign keys). 
- return dict( - columnReference=fk['fields'], - reference=dict( - columnReference=fk['reference']['fields'], - resource=resource_map[fk['reference']['resource'] or rsc_name], - ) - ) + return { + 'columnReference': fk['fields'], + 'reference': { + 'columnReference': fk['reference']['fields'], + 'resource': resource_map[fk['reference']['resource'] or rsc_name], + } + } def convert_table_schema(rsc_name, schema, resource_map): @@ -104,9 +113,7 @@ def convert_table_schema(rsc_name, schema, resource_map): key constraints. :return: `dict` suitable for instantiating a `csvw.metadata.Schema` object. """ - res = dict( - columns=[convert_column_spec(f) for f in schema['fields']], - ) + res = {'columns': [convert_column_spec(f) for f in schema['fields']]} for prop in [ ('missingValues', 'null'), 'primaryKey', @@ -152,7 +159,10 @@ def convert_dialect(rsc): return res -class DataPackage: +class DataPackage: # pylint: disable=R0903 + """ + Metadata according to the frictionless spec. + """ def __init__(self, spec, directory=None): if isinstance(spec, DataPackage): self.json = spec.json @@ -170,10 +180,10 @@ def __init__(self, spec, directory=None): self.json = spec - def to_tablegroup(self, cls=None): - from csvw import TableGroup + def to_tablegroup(self, cls: Optional[type] = None) -> 'TableGroup': # pylint: disable=C0116 + from csvw import TableGroup # pylint: disable=C0415 - md = {'@context': "http://www.w3.org/ns/csvw"} + md: dict[str, Any] = {'@context': "http://www.w3.org/ns/csvw"} # Package metadata: md['dc:replaces'] = json.dumps(self.json) @@ -211,14 +221,14 @@ def to_tablegroup(self, cls=None): rsc.get('format') == 'csv': # Table Schema: md.setdefault('tables', []) - table = dict( - url=rsc['path'], - tableSchema=convert_table_schema(rsc.get('name'), schema, resource_map), - dialect=convert_dialect(rsc), - ) + table = { + 'url': rsc['path'], + 'tableSchema': convert_table_schema(rsc.get('name'), schema, resource_map), + 'dialect': convert_dialect(rsc), + } 
md['tables'].append(table) cls = cls or TableGroup res = cls.fromvalue(md) - res._fname = self.dir / 'csvw-metadata.json' + res._fname = self.dir / 'csvw-metadata.json' # pylint: disable=W0212 return res diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 9e296a1..1eb07ae 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -54,7 +54,7 @@ T = TypeVar('T') -class Invalid: +class Invalid: # pylint: disable=R0903,C0115: pass @@ -86,7 +86,7 @@ class Dialect(BaseDialect): class URITemplate(uritemplate.URITemplate): - + """URITemplate properties support expansion, given suitable context.""" def __eq__(self, other): if isinstance(other, str): return self.uri == other @@ -94,11 +94,11 @@ def __eq__(self, other): return False return super().__eq__(other) - def asdict(self, **_): + def asdict(self, **_): # pylint: disable=C0116 return f'{self}' -def convert_uri_template(v): +def convert_uri_template(v): # pylint: disable=C0116 if v is None: return None # pragma: no cover if not isinstance(v, str): @@ -119,7 +119,7 @@ def __init__(self, string: Union[str, pathlib.Path]): self.string = string @classmethod - def from_value(cls, v: Union['Link', str, pathlib.Path]): + def from_value(cls, v: Union['Link', str, pathlib.Path]): # pylint: disable=C0116 if isinstance(v, Link): return v # pragma: no cover return cls(v) @@ -233,7 +233,7 @@ class Datatype(DescriptionBase): # pylint: disable=too-many-instance-attributes def __post_init__(self): self.base = functools.partial( - utils.converter, + utils.type_checker, str, 'string', allow_none=True, @@ -305,7 +305,7 @@ def error_if(msg, *conditions): def _set_constraints(self): for att in ('length', 'maxLength', 'minLength'): - setattr(self, att, utils.optional(int)(getattr(self, att))) + setattr(self, att, utils.optcast(int)(getattr(self, att))) for attr_ in [ 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' ]: @@ -332,7 +332,8 @@ def fromvalue(cls, d: Union[str, dict, 'Datatype']) 
-> 'Datatype': raise ValueError(d) - def asdict(self, omit_defaults=True): + def asdict(self, omit_defaults=True) -> dict: + """The datatype serialized as dict suitable for conversion to JSON.""" res = DescriptionBase.asdict(self, omit_defaults=omit_defaults) for attr_ in [ 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' @@ -344,11 +345,11 @@ def asdict(self, omit_defaults=True): return res @property - def basetype(self) -> type: + def basetype(self) -> type: # pylint: disable=C0116 return DATATYPES[self.base] @property - def derived_description(self) -> dict: + def derived_description(self) -> dict: # pylint: disable=C0116 return self.basetype.derived_description(self) def formatted(self, v: Any) -> str: @@ -424,7 +425,7 @@ class Description(DescriptionBase): # pylint: disable=R0902 def __post_init__(self): if self.datatype is not None: self.datatype = Datatype.fromvalue(self.datatype) - self.default = utils.converter(str, "", self.default, allow_list=False) + self.default = utils.type_checker(str, "", self.default, allow_list=False) if not tags.check(self.lang): warnings.warn('Invalid language tag') self.lang = 'und' @@ -434,9 +435,9 @@ def __post_init__(self): if not all(isinstance(vv, str) for vv in self.null): warnings.warn('Invalid null property') self.null = [""] - self.ordered = utils.converter(bool, False, self.ordered, allow_none=True) - self.separator = utils.converter(str, None, self.separator, allow_none=True) - self.textDirection = utils.converter( + self.ordered = utils.type_checker(bool, False, self.ordered, allow_none=True) + self.separator = utils.type_checker(str, None, self.separator, allow_none=True) + self.textDirection = utils.type_checker( str, None, self.textDirection, @@ -487,8 +488,8 @@ class Column(Description): def __post_init__(self): super().__post_init__() - self.name = utils.converter(str, None, self.name, allow_none=True) - self.suppressOutput = utils.converter(bool, False, self.suppressOutput) 
+ self.name = utils.type_checker(str, None, self.name, allow_none=True) + self.suppressOutput = utils.type_checker(bool, False, self.suppressOutput) if self.titles is not None: try: @@ -497,7 +498,7 @@ def __post_init__(self): warnings.warn('Invalid titles property') self.titles = None - self.virtual = utils.converter(bool, False, self.virtual) + self.virtual = utils.type_checker(bool, False, self.virtual) def __str__(self): return self.name or (self.titles and self.titles.getfirst()) or f'_col.{self._number}' @@ -649,14 +650,14 @@ def __post_init__(self): super().__post_init__() self.columns = [ Column.fromvalue(c) for c in - utils.converter(dict, None, utils.converter(list, [], self.columns))] + utils.type_checker(dict, None, utils.type_checker(list, [], self.columns))] for i, col in enumerate(self.columns): col._number = i + 1 # pylint: disable=protected-access if self.foreignKeys is None: self.foreignKeys = [] else: res = [] - for d in utils.converter(dict, None, self.foreignKeys): + for d in utils.type_checker(dict, None, self.foreignKeys): try: res.append(ForeignKey.fromdict(d)) except TypeError: @@ -770,7 +771,7 @@ def __post_init__(self): elif self.dialect is not None: self.dialect = Dialect(**dialect_props(self.dialect)) - self.tableDirection = utils.converter( + self.tableDirection = utils.type_checker( str, 'auto', self.tableDirection, cond=lambda s: s in ['rtl', 'ltr', 'auto']) self.tableSchema = Schema.fromvalue(self.tableSchema) @@ -823,7 +824,7 @@ def to_file(self, fname: Union[str, pathlib.Path], omit_defaults=True) -> pathli description objects. If `omit_defaults==True`, these properties will be pruned from \ the JSON object. 
""" - fname = utils.ensure_path(fname) + fname = pathlib.Path(fname) data = self.asdict(omit_defaults=omit_defaults) with utils.json_open(str(fname), 'w') as f: json.dump(data, f, indent=4, separators=(',', ': ')) @@ -1037,6 +1038,11 @@ def write(self, def check_primary_key(self, log=None, items=None) -> bool: """Make sure primary keys are unique.""" + # We want to silence error logging when reading table rows, because we are not interested + # in conversion errors here. + nolog = logging.getLogger(__name__) + nolog.addHandler(logging.NullHandler()) + success = True if items is not None: warnings.warn('the items argument of check_primary_key ' @@ -1045,7 +1051,7 @@ def check_primary_key(self, log=None, items=None) -> bool: get_pk = operator.itemgetter(*self.tableSchema.primaryKey) seen = set() # Read all rows in the table, ignoring errors: - for fname, lineno, row in self.iterdicts(log=utils.nolog(), with_metadata=True): + for fname, lineno, row in self.iterdicts(log=nolog, with_metadata=True): pk = get_pk(row) if pk in seen: utils.log_or_raise(f'{fname}:{lineno} duplicate primary key: {pk}', log=log) diff --git a/src/csvw/metadata_utils.py b/src/csvw/metadata_utils.py index bb9534a..effda0d 100644 --- a/src/csvw/metadata_utils.py +++ b/src/csvw/metadata_utils.py @@ -1,16 +1,23 @@ """ Helpers to model CSVW metadata as dataclasses. 
""" +import re +import copy +import html +import json import decimal import warnings import collections from collections.abc import Generator import dataclasses -from typing import Any, Optional, Union +from typing import Any, Optional, Union, TYPE_CHECKING from language_tags import tags -from .utils import is_url +from .utils import is_url, slug + +if TYPE_CHECKING: + from csvw.metadata import TableGroup __all__ = ['valid_common_property', 'valid_id_property', 'valid_context_property', 'DescriptionBase', 'dataclass_asdict', 'NAMESPACES', 'dialect_props'] @@ -170,7 +177,11 @@ def valid_id_property(v: str) -> Optional[str]: return v -def valid_context_property(ctx): +def valid_context_property(ctx: Union[None, str, list]) -> Union[None, str, list]: + """ + Make sure the requirements for @context objects in CSVW are met. + If not, warn or raise exceptions accordingly. + """ nsurl = NAMESPACES['csvw'].replace('#', '') if ctx is None: return ctx @@ -333,3 +344,129 @@ def dialect_props(d: dict[str, Any]) -> dict: if partitioned.get('headerRowCount'): partitioned['header'] = True return partitioned + + +def qname2url(qname: str) -> Optional[str]: + """Turn a qname into an http URL by replacing the prefix with the associated URL.""" + for prefix, uri in NAMESPACES.items(): + if qname.startswith(prefix + ':'): + return qname.replace(prefix + ':', uri) + return None + + +def metadata2markdown(tg: 'TableGroup', link_files: bool = False) -> str: + """ + Render the metadata of a dataset as markdown. + + :param link_files: If True, links to data files will be added, assuming the markdown is stored \ + in the same directory as the metadata file. 
+ :return: `str` with markdown formatted text + """ + fname = tg._fname # pylint: disable=W0212 + res = [f"# {tg.common_props.get('dc:title', 'Dataset')}\n"] + if fname and link_files: + res.append(f'> [!NOTE]\n> Described by [{fname.name}]({fname.name}).\n') + + res.append(_properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'})) + + for table in tg.tables: + res.extend(list(_iter_table2markdown(tg, table, link_files))) + return '\n'.join(res) + + +def _qname2link(qname, html=False): # pylint: disable=W0621 + url = qname2url(qname) + if url: + if html: + return f'{qname}' + return f'[{qname}]({url})' + return qname + + +def _htmlify(obj, key=None): + """ + For inclusion in tables we must use HTML for lists. + """ + if isinstance(obj, list): + lis = ''.join(f'
  • {_htmlify(item, key=key)}
  • ' for item in obj) + return f'
      {lis}
    ' + if isinstance(obj, dict): + items = [] + for k, v in obj.items(): + items.append(f'
    {_qname2link(k, html=True)}
    {html.escape(str(v))}
    ') + return f"
    {''.join(items)}
    " + return str(obj) + + +def _properties(props): + def _img(img: Union[str, dict]): + if isinstance(img, str): # pragma: no cover + img = {'https://schema.org/contentUrl': img} + return (f"![{img.get('https://schema.org/caption') or ''}]" + f"({img.get('https://schema.org/contentUrl')})\n") + + props = {k: v for k, v in copy.deepcopy(props).items() if v} + res = [] + desc = props.pop('dc:description', None) + if desc: + res.append(desc + '\n') + img = props.pop('https://schema.org/image', None) + if img: + res.append(_img(img)) + if props: + res.append('property | value\n --- | ---') + for k, v in props.items(): + res.append(f'{_qname2link(k)} | {_htmlify(v, key=k)}') + return '\n'.join(res) + '\n' + + +def _iter_table2markdown(tg, table, link_files): + fks = { + fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) + for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} + header = f'## Table ' + fname = tg._fname # pylint: disable=W0212 + if (link_files and fname and fname.parent.joinpath(table.url.string).exists()): + header += f'[{table.url.string}]({table.url.string})\n' + else: # pragma: no cover + header += table.url.string + yield '\n' + header + '\n' + yield _properties(table.common_props) + dialect = table.inherit('dialect') + if dialect.asdict(): + yield f'\n**CSV dialect**: `{json.dumps(dialect.asdict())}`\n' + yield '\n### Columns\n' + yield 'Name/Property | Datatype | Description' + yield ' --- | --- | --- ' + for col in table.tableSchema.columns: + yield _colrow(col, fks, table.tableSchema.primaryKey) + + +def _colrow(col, fks, pk): + dt = f"`{col.datatype.base if col.datatype else 'string'}`" + if col.datatype: + if col.datatype.format: + if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): + dt += '
    Valid choices:
    ' + dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|')) + elif col.datatype.base == 'string': + dt += f'
    Regex: `{col.datatype.format}`' + if col.datatype.minimum: + dt += f'
    ≥ {col.datatype.minimum}' + if col.datatype.maximum: + dt += f'
    ≤ {col.datatype.maximum}' + if col.separator: + dt = f'list of {dt} (separated by `{col.separator}`)' + desc = col.common_props.get('dc:description', '').replace('\n', ' ') + + if pk and col.name in pk: + desc = (desc + '
    ') if desc else desc + desc += 'Primary key' + + if col.name in fks: + desc = (desc + '
    ') if desc else desc + cname, tname = fks[col.name] + desc += f'References [{tname}::{cname}](#table-{slug(tname)})' + + return ' | '.join([ + f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) diff --git a/src/csvw/utils.py b/src/csvw/utils.py index 3228e14..f9a579a 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -1,16 +1,15 @@ +""" +Misc +""" import io -import logging import re -import copy -import html import json import string import keyword -import pathlib +import logging import warnings import collections import unicodedata -from types import MethodType from typing import Callable, Any, Union, Optional import requests @@ -21,27 +20,24 @@ def log_or_raise( log: Optional[logging.Logger] = None, level: str = 'warning', exception_cls: type = ValueError): + """ + Helper for error handling. In an inspection scenario, we want to list - i.e. log - all + errors. In a validation scenario, we raise an exception at the first error. + """ if log: getattr(log, level)(msg) else: raise exception_cls(msg) -def nolog(level='warning'): - class Log: - pass - - log = Log() - setattr(log, level, MethodType(lambda *args, **kw: None, log)) - return log - - def json_open(filename, mode='r', encoding='utf-8'): + """Open a text file suitable for reading JSON content, i.e. 
assuming it is utf-8 encoded.""" assert encoding == 'utf-8' return io.open(filename, mode, encoding=encoding) def get_json(fname) -> Union[list, dict]: + """Retrieve JSON content from a local file or remote URL.""" fname = str(fname) if is_url(fname): return requests.get(fname, timeout=10).json(object_pairs_hook=collections.OrderedDict) @@ -49,31 +45,36 @@ def get_json(fname) -> Union[list, dict]: return json.load(f, object_pairs_hook=collections.OrderedDict) -def optional(type_: type) -> Callable[[Any], Any]: +def optcast(type_: type) -> Callable[[Any], Any]: + """Returns a callable that casts its argument to type_ unless it is None.""" return lambda v: v if v is None else type_(v) -def is_url(s): +def is_url(s): # pylint: disable=C0116 return re.match(r'https?://', str(s)) -def converter(type_, default, s, allow_none=False, cond=None, allow_list=True): - if allow_list and type_ != list and isinstance(s, list): - return [v for v in [converter(type_, None, ss, cond=cond) for ss in s] if v is not None] - - if allow_none and s is None: - return s - if not isinstance(s, type_) or (type_ == int and isinstance(s, bool)) or (cond and not cond(s)): - warnings.warn('Invalid value for property: {}'.format(s)) +def type_checker( # pylint: disable=R0913,R0917 + type_: type, + default: Optional[Any], + v: Union[list[Any], Any], + allow_none: bool = False, + cond: Optional[Callable[[Any], bool]] = None, + allow_list=True, +) -> Any: + """Check if a value has a certain type (with bells and whistles), warn if not.""" + if allow_list and type_ != list and isinstance(v, list): + # Convert a list of strings by applying the conversion to each not-None item. + return [v for v in [type_checker(type_, None, vv, cond=cond) for vv in v] if v is not None] + + if allow_none and v is None: + return v + + # Note: `bool` is a `subclass` of int in Python! 
+ if not isinstance(v, type_) or (type_ == int and isinstance(v, bool)) or (cond and not cond(v)): + warnings.warn(f'Invalid value for property: {v}') return default - return s - - -def ensure_path(fname): - if not isinstance(fname, pathlib.Path): - assert isinstance(fname, str) - return pathlib.Path(fname) - return fname + return v def normalize_name(s): @@ -121,123 +122,3 @@ def slug(s, remove_whitespace=True, lowercase=True): res = res.encode('ascii', 'ignore').decode('ascii') assert re.match('[ A-Za-z0-9]*$', res) return res - - -def qname2url(qname): - for prefix, uri in { - 'csvw': 'http://www.w3.org/ns/csvw#', - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', - 'xsd': 'http://www.w3.org/2001/XMLSchema#', - 'dc': 'http://purl.org/dc/terms/', - 'dcat': 'http://www.w3.org/ns/dcat#', - 'prov': 'http://www.w3.org/ns/prov#', - }.items(): - if qname.startswith(prefix + ':'): - return qname.replace(prefix + ':', uri) - - -def metadata2markdown(tg, link_files=False) -> str: - """ - Render the metadata of a dataset as markdown. - - :param link_files: If True, links to data files will be added, assuming the markdown is stored \ - in the same directory as the metadata file. - :return: `str` with markdown formatted text - """ - def qname2link(qname, html=False): - url = qname2url(qname) - if url: - if html: - return f'{qname}' - return f'[{qname}]({url})' - return qname - - def htmlify(obj, key=None): - """ - For inclusion in tables we must use HTML for lists. - """ - if isinstance(obj, list): - lis = ''.join(f'
  • {htmlify(item, key=key)}
  • ' for item in obj) - return f'
      {lis}
    ' - if isinstance(obj, dict): - items = [] - for k, v in obj.items(): - items.append(f'
    {qname2link(k, html=True)}
    {html.escape(str(v))}
    ') - return f"
    {''.join(items)}
    " - return str(obj) - - def properties(props): - props = {k: v for k, v in copy.deepcopy(props).items() if v} - res = [] - desc = props.pop('dc:description', None) - if desc: - res.append(desc + '\n') - img = props.pop('https://schema.org/image', None) - if img: - if isinstance(img, str): # pragma: no cover - img = {'contentUrl': img} - res.append('![{}]({})\n'.format( - img.get('https://schema.org/caption') or '', - img.get('https://schema.org/contentUrl'))) - if props: - res.append('property | value\n --- | ---') - for k, v in props.items(): - res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k))) - return '\n'.join(res) + '\n' - - def colrow(col, fks, pk): - dt = '`{}`'.format(col.datatype.base if col.datatype else 'string') - if col.datatype: - if col.datatype.format: - if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): - dt += '
    Valid choices:
    ' - dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|')) - elif col.datatype.base == 'string': - dt += f'
    Regex: `{col.datatype.format}`' - if col.datatype.minimum: - dt += f'
    ≥ {col.datatype.minimum}' - if col.datatype.maximum: - dt += f'
    ≤ {col.datatype.maximum}' - if col.separator: - dt = f'list of {dt} (separated by `{col.separator}`)' - desc = col.common_props.get('dc:description', '').replace('\n', ' ') - - if pk and col.name in pk: - desc = (desc + '
    ') if desc else desc - desc += 'Primary key' - - if col.name in fks: - desc = (desc + '
    ') if desc else desc - desc += 'References [{}::{}](#table-{})'.format( - fks[col.name][1], fks[col.name][0], slug(fks[col.name][1])) - - return ' | '.join([ - f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) - - res = ['# {}\n'.format(tg.common_props.get('dc:title', 'Dataset'))] - if tg._fname and link_files: - res.append(f'> [!NOTE]\n> Described by [{tg._fname.name}]({tg._fname.name}).\n') - - res.append(properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'})) - - for table in tg.tables: - fks = { - fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) - for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} - header = f'## Table ' - if link_files and tg._fname and tg._fname.parent.joinpath(table.url.string).exists(): - header += f'[{table.url.string}]({table.url.string})\n' - else: # pragma: no cover - header += table.url.string - res.append('\n' + header + '\n') - res.append(properties(table.common_props)) - dialect = table.inherit('dialect') - if dialect.asdict(): - res.append(f'\n**CSV dialect**: `{json.dumps(dialect.asdict())}`\n') - res.append('\n### Columns\n') - res.append('Name/Property | Datatype | Description') - res.append(' --- | --- | --- ') - for col in table.tableSchema.columns: - res.append(colrow(col, fks, table.tableSchema.primaryKey)) - return '\n'.join(res) diff --git a/tests/test_utils.py b/tests/test_utils.py index d5b25f0..ef23876 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,10 +3,6 @@ from csvw import utils -def test_ensure_path(): - assert isinstance(utils.ensure_path('test.csv'), pathlib.Path) - - def test_normalize_name(): assert utils.normalize_name('') == '_' assert utils.normalize_name('0') == '_0' From 7139740bb07ec5b9de48bec7f0564cc5fac98fde Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 10:52:02 +0100 Subject: [PATCH 11/17] shaved off another dep: requests --- setup.cfg | 2 - 
src/csvw/dsv_dialects.py | 24 +++++---- src/csvw/metadata.py | 26 ++++----- src/csvw/utils.py | 83 ++++++++++++++++++++++++++++- tests/conftest.py | 107 ++++++++++++++++++++------------------ tests/test_conformance.py | 12 ++--- tests/test_metadata.py | 46 +++++++--------- 7 files changed, 188 insertions(+), 112 deletions(-) diff --git a/setup.cfg b/setup.cfg index 3a61226..425d1d8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -46,7 +46,6 @@ install_requires = rfc3986<2 uritemplate>=3.0.0 babel - requests language-tags rdflib termcolor @@ -75,7 +74,6 @@ test = frictionless pytest>=5 pytest-mock - requests-mock pytest-cov docs = sphinx<7 diff --git a/src/csvw/dsv_dialects.py b/src/csvw/dsv_dialects.py index 288d8c0..76a17a2 100644 --- a/src/csvw/dsv_dialects.py +++ b/src/csvw/dsv_dialects.py @@ -25,17 +25,18 @@ def convert_encoding(s): + """We want to force utf-8 encoding, but accept diverse ways of specifying this :).""" s = utils.type_checker(str, 'utf-8', s) try: _ = 'x'.encode(ENCODING_MAP.get(s, s)) return s except LookupError: - warnings.warn('Invalid value for property: {}'.format(s)) + warnings.warn(f'Invalid value for property: {s}') return 'utf-8' @dataclasses.dataclass -class Dialect: +class Dialect: # pylint: disable=too-many-instance-attributes """ A CSV dialect specification. 
@@ -43,17 +44,18 @@ class Dialect: """ encoding: str = 'utf-8' - lineTerminators: list[str] = dataclasses.field(default_factory=lambda: ['\r\n', '\n']) - quoteChar: str = '"' - doubleQuote: bool = True - skipRows: int = 0 - commentPrefix: str = '#' + lineTerminators: list[str] = dataclasses.field( # pylint: disable=invalid-name + default_factory=lambda: ['\r\n', '\n']) + quoteChar: str = '"' # pylint: disable=invalid-name + doubleQuote: bool = True # pylint: disable=invalid-name + skipRows: int = 0 # pylint: disable=invalid-name + commentPrefix: str = '#' # pylint: disable=invalid-name header: bool = True - headerRowCount: int = 1 + headerRowCount: int = 1 # pylint: disable=invalid-name delimiter: str = ',' - skipColumns: int = 0 - skipBlankRows: bool = False - skipInitialSpace: bool = False + skipColumns: int = 0 # pylint: disable=invalid-name + skipBlankRows: bool = False # pylint: disable=invalid-name + skipInitialSpace: bool = False # pylint: disable=invalid-name trim: Literal['true', 'false', 'start', 'end'] = 'false' def __post_init__(self): diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 1eb07ae..5a9ae16 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -27,7 +27,6 @@ from urllib.parse import urljoin, urlparse, urlunparse from language_tags import tags -import requests import uritemplate from . 
import utils @@ -701,7 +700,7 @@ def fromvalue(cls, d: Union[dict, str]) -> 'Schema': if isinstance(d, str): try: # The schema is referenced with a URL - d = requests.get(d, timeout=10).json() + d = utils.request_get(d).json() except: # pragma: no cover # noqa: E722 # pylint: disable=W0702 return d if not isinstance(d, dict): @@ -1066,7 +1065,7 @@ def __iter__(self): def _get_csv_reader(self, fname, dialect, stack) -> UnicodeReaderWithLineNumber: if is_url(fname): handle = io.TextIOWrapper( - io.BytesIO(requests.get(str(fname), timeout=10).content), encoding=dialect.encoding) + io.BytesIO(utils.request_get(str(fname)).content), encoding=dialect.encoding) else: handle = fname fpath = pathlib.Path(fname) @@ -1609,23 +1608,24 @@ def describes(md, url): if url and is_url(url): # §5.2 Link Header # https://w3c.github.io/csvw/syntax/#link-header - res = requests.head(url, timeout=10) - no_header = bool(re.search(r'header\s*=\s*absent', res.headers.get('content-type', ''))) - desc = res.links.get('describedby') - if desc and desc['type'] in [ - "application/csvm+json", "application/ld+json", "application/json"]: - md = utils.get_json(Link(desc['url']).resolve(url)) - if describes(md, url): - return md, no_header + content_type, links = utils.request_head(url) + no_header = bool(re.search(r'header\s*=\s*absent', content_type)) + for link in links: + if link.params.get('rel') == 'describedby': + if link.params.get('type') in [ + "application/csvm+json", "application/ld+json", "application/json"]: + md = utils.get_json(Link(link.url).resolve(url)) + if describes(md, url): + return md, no_header warnings.warn('Ignoring linked metadata because it does not reference the data') # §5.3 Default Locations and Site-wide Location Configuration # https://w3c.github.io/csvw/syntax/ # #default-locations-and-site-wide-location-configuration - res = requests.get(Link('/.well-known/csvm').resolve(url), timeout=10) + res = utils.request_get(Link('/.well-known/csvm').resolve(url)) locs = 
res.text if res.status_code == 200 else '{+url}-metadata.json\ncsv-metadata.json' for line in locs.split('\n'): - res = requests.get(Link(URITemplate(line).expand(url=url)).resolve(url), timeout=10) + res = utils.request_get(Link(URITemplate(line).expand(url=url)).resolve(url)) if res.status_code == 200: try: md = res.json() diff --git a/src/csvw/utils.py b/src/csvw/utils.py index f9a579a..da8f91e 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -9,10 +9,89 @@ import logging import warnings import collections +import dataclasses import unicodedata +import urllib.request from typing import Callable, Any, Union, Optional -import requests +HTTP_REQUEST_TIMEOUT = 10 + + +@dataclasses.dataclass +class LinkHeader: + """ + https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Link + """ + url: str + params: dict[str, str] + + @classmethod + def from_string(cls, s): + """ + ; param1=value1; param2="value2" + """ + comps = re.split(r'>\s*;\s*', s.strip(), maxsplit=1) + if len(comps) == 2: + url, sparams = comps + else: + url, sparams = comps[0], '' + assert url.startswith('<') + url = url[1:].strip() + params = {} + for sparam in sparams.split(';'): + key, _, value = sparam.strip().partition('=') + key, value = key.strip(), (value or '').strip() + if value.startswith('"'): + assert value.endswith('"') + value = value[1:-1].strip() + params[key] = value or None + return cls(url=url, params=params) + + @classmethod + def iter_links(cls, s): + """ + A Link header might contain multiple links separated by comma. 
+ """ + for i, single in enumerate(re.split(r',\s*<', s)): + yield cls.from_string(single if i == 0 else '<' + single) + + +def request_head(url) -> tuple[str, list[LinkHeader]]: + """Makes a HEAD request and returns the relevant response data.""" + req = urllib.request.Request(url, method='HEAD') + with urllib.request.urlopen(req, timeout=HTTP_REQUEST_TIMEOUT) as response: + links = [] + for mult in response.info().get_all('Link') or []: + links.extend(LinkHeader.iter_links(mult)) + return response.info().get_content_type() or '', links + + +@dataclasses.dataclass +class GetResponse: + status_code: int = 200 + content: bytes = None + text: str = None + + def __post_init__(self): + if self.content and not self.text: + self.text = self.content.decode('utf8') + if self.text and not self.content: + self.content = self.text.encode('utf8') + + @classmethod + def from_response(cls, response): + content = response.read() + text = content.decode(response.headers.get_content_charset() or 'utf-8') + return cls(status_code=response.status, content=content, text=text) + + def json(self): + return json.loads(self.text, object_pairs_hook=collections.OrderedDict) + + +def request_get(url: str) -> GetResponse: + """Makes a GET request.""" + with urllib.request.urlopen(url, timeout=HTTP_REQUEST_TIMEOUT) as response: + return GetResponse.from_response(response) def log_or_raise( @@ -40,7 +119,7 @@ def get_json(fname) -> Union[list, dict]: """Retrieve JSON content from a local file or remote URL.""" fname = str(fname) if is_url(fname): - return requests.get(fname, timeout=10).json(object_pairs_hook=collections.OrderedDict) + return request_get(fname).json() with json_open(fname) as f: return json.load(f, object_pairs_hook=collections.OrderedDict) diff --git a/tests/conftest.py b/tests/conftest.py index 7f32cc7..5770d02 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,25 +2,26 @@ import pathlib import warnings import contextlib +import dataclasses import urllib.parse 
import urllib.request +from typing import Optional, Literal import pytest -import attr from csvw.metadata import CSVW -from csvw.utils import get_json +from csvw.utils import get_json, LinkHeader, GetResponse def pytest_addoption(parser): parser.addoption("--number", type=int, help="csvw json test number", default=None) -def csvw_tests_url(path): - return 'http://www.w3.org/2013/csvw/tests/{}'.format(path) +def csvw_tests_url(path) -> str: + return f'http://www.w3.org/2013/csvw/tests/{path}' -def csvw_tests_path(path): +def csvw_tests_path(path) -> pathlib.Path: return pathlib.Path(__file__).parent / 'fixtures' / 'csvw' / 'tests' / path @@ -32,27 +33,32 @@ def unorder(o): return o -@attr.s +@dataclasses.dataclass class CSWVTest: - id = attr.ib(converter=lambda s: s.split('#')[-1]) - type = attr.ib(validator=attr.validators.in_([ + id: str + type: Literal[ 'csvt:NegativeJsonTest', 'csvt:ToJsonTest', 'csvt:ToJsonTestWithWarnings', 'csvt:PositiveValidationTest', 'csvt:NegativeValidationTest', - 'csvt:WarningValidationTest', - ])) - name = attr.ib() - comment = attr.ib() - approval = attr.ib() - option = attr.ib( - converter=lambda d: {k: csvw_tests_url(v) if k == 'metadata' else v for k, v in d.items()}) - action = attr.ib(converter=lambda s: csvw_tests_url(s)) - result = attr.ib(converter=lambda s: csvw_tests_url(s) if s else None, default=None) - implicit = attr.ib(default=None) - httpLink = attr.ib(default=None) - contentType = attr.ib(default=None) + 'csvt:WarningValidationTest'] + name: str + comment: str + approval: str + option: dict + action: str + result: Optional[str] = None + implicit: str = None + httpLink: str = None + contentType: str = None + + def __post_init__(self): + self.id = self.id.split('#')[-1] + self.option = { + k: csvw_tests_url(v) if k == 'metadata' else v for k, v in self.option.items()} + self.action = csvw_tests_url(self.action) + self.result = csvw_tests_url(self.result) if self.result else None @property def is_json_test(self): @@ 
-66,6 +72,31 @@ def is_validation_test(self): def number(self): # pragma: no cover return int(self.id.replace('test', '')) + def request_head(self, _): + if self.contentType: + return self.contentType, [] + if self.httpLink: + return '', [LinkHeader.from_string(self.httpLink)] + return '', [] + + @staticmethod + def request_get(url): + url = urllib.parse.urlparse(url) + if url.netloc == 'www.w3.org': + if url.path.startswith('/2013/csvw/tests/'): + p = csvw_tests_path(url.path.replace('/2013/csvw/tests/', '')) + if p.exists(): + return GetResponse(content=p.read_bytes()) + elif url.path == '/.well-known/csvm': + return GetResponse( + text="""{+url}-metadata.json + csv-metadata.json + {+url}.json + csvm.json + """) + return GetResponse(status_code=404) + raise ValueError(url) # pragma: no cover + def _run(self): with contextlib.ExitStack() as stack: if self.type == "csvt:ToJsonTestWithWarnings": @@ -107,37 +138,11 @@ def _run(self): unorder(get_json(self.result)), \ '{}: {}'.format(self.id, self.name) - def run(self): - import requests_mock - - def text_callback(request, context): - url = urllib.parse.urlparse(request.url) - if url.netloc == 'www.w3.org': - if url.path.startswith('/2013/csvw/tests/'): - p = csvw_tests_path(url.path.replace('/2013/csvw/tests/', '')) - if p.exists(): - context.status_code = 200 - return p.read_text(encoding='utf8') - elif url.path == '/.well-known/csvm': - context.status_code = 200 - return """{+url}-metadata.json -csv-metadata.json -{+url}.json -csvm.json -""" - context.status_code = 404 - return '' - raise ValueError(request.url) # pragma: no cover - - with requests_mock.Mocker() as mock: - if self.contentType: - mock.head(self.action, text='', headers={'Content-Type': self.contentType}) - elif self.httpLink: - mock.head(self.action, text='', headers={'Link': self.httpLink}) - else: - mock.head(self.action, text='', headers={}) - mock.get(requests_mock.ANY, text=text_callback) - self._run() + def run(self, mocker): + 
mocker.patch('csvw.metadata.utils.request_head', self.request_head) + mocker.patch('csvw.metadata.utils.request_get', self.request_get) + mocker.patch('csvw.utils.request_get', self.request_get) + self._run() def pytest_generate_tests(metafunc): diff --git a/tests/test_conformance.py b/tests/test_conformance.py index b72833b..e96231f 100644 --- a/tests/test_conformance.py +++ b/tests/test_conformance.py @@ -6,18 +6,18 @@ @pytest.mark.conformance -def test_csvw_json(csvwjsontest): - csvwjsontest.run() +def test_csvw_json(csvwjsontest, mocker): + csvwjsontest.run(mocker) @pytest.mark.conformance -def test_csvw_nonnorm(csvwnonnormtest): - csvwnonnormtest.run() +def test_csvw_nonnorm(csvwnonnormtest, mocker): + csvwnonnormtest.run(mocker) @pytest.mark.conformance -def test_csvw_validation(csvwvalidationtest): - csvwvalidationtest.run() +def test_csvw_validation(csvwvalidationtest, mocker): + csvwvalidationtest.run(mocker) def test_prefix_in_property_url(): diff --git a/tests/test_metadata.py b/tests/test_metadata.py index bc25fd9..9cce7a4 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -14,6 +14,7 @@ import csvw from csvw.dsv import Dialect +from csvw.utils import GetResponse FIXTURES = pathlib.Path(__file__).parent / 'fixtures' @@ -759,22 +760,20 @@ def test_foreignkeys_2(self, tmp_path): with pytest.raises(ValueError): tg.check_referential_integrity() - def test_remote_schema(self, tmp_path): - import requests_mock - - with requests_mock.Mocker() as m: - schema = """ - {"columns": [ - {"name": "countryCode", "datatype": "string"}, - {"name": "name", "datatype": "string"}]} - """ - m.get("http://example.com/schema", content=schema.encode('utf8')) - tg = self._make_tablegroup( - tmp_path, - metadata="""{ + def test_remote_schema(self, tmp_path, mocker): + def request_get(url): + return GetResponse(text="""\ +{"columns": [ + {"name": "countryCode", "datatype": "string"}, + {"name": "name", "datatype": "string"}]}""") + + 
mocker.patch('csvw.metadata.utils.request_get', request_get) + tg = self._make_tablegroup( + tmp_path, + metadata="""{ "@context": "http://www.w3.org/ns/csvw", "tables": [{"url": "countries.csv", "tableSchema": "http://example.com/schema"}]}""") - assert len(tg.tables[0].tableSchema.columns) == 2 + assert len(tg.tables[0].tableSchema.columns) == 2 # The remote content has been inlined: out = tmp_path / 'md.json' @@ -824,20 +823,13 @@ def test_zip_support(tmp_path): assert len(list(csvw.TableGroup.from_file(out.parent / 'md.json').tables[0])) == 4 -def test_from_url(): - import requests_mock +def test_from_url(mocker): + def request_get(url): + return GetResponse(content=FIXTURES.joinpath(url.split('/')[-1]).read_bytes()) - def content(req, ctx): - ctx.status_code = 200 - return FIXTURES.joinpath(req.url.split('/')[-1]).read_bytes() - - with requests_mock.Mocker() as m: - m.get( - requests_mock.ANY, - content=content) - - t = csvw.Table.from_file('http://example.com/csv.txt-table-metadata.json') - assert len(list(t)) == 2 + mocker.patch('csvw.utils.request_get', request_get) + t = csvw.Table.from_file('http://example.com/csv.txt-table-metadata.json') + assert len(list(t)) == 2 def test_datatype_limits(tmp_path): From c9f38709c721e0f594ff5b765392eacb93006209 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 11:36:57 +0100 Subject: [PATCH 12/17] polishing --- src/csvw/datatypes.py | 5 ++-- src/csvw/db.py | 2 +- src/csvw/frictionless.py | 2 +- src/csvw/jsonld.py | 2 +- src/csvw/metadata.py | 2 +- src/csvw/metadata_utils.py | 2 +- src/csvw/utils.py | 26 ++++++++++-------- tests/test_metadata.py | 20 +++++++------- tests/test_utils.py | 55 +++++++++++++++++++++++++++++++++++++- 9 files changed, 86 insertions(+), 30 deletions(-) diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py index ff43d7a..6cc8ca2 100644 --- a/src/csvw/datatypes.py +++ b/src/csvw/datatypes.py @@ -610,9 +610,8 @@ def repl(m): return groupChar if m.group('c') == '.': return 
decimalChar - return None - r = f"(?P[{re.escape((decimalChar or '') + (groupChar or ''))}])" - v = re.sub(r, repl, v) + raise ValueError(m.group('c')) # pragma: no cover + v = re.sub(r"(?P[,.])", repl, v) return v diff --git a/src/csvw/db.py b/src/csvw/db.py index 51e0704..dea0975 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -432,7 +432,7 @@ def separator(self, tname: str, cname: str) -> Optional[str]: for col in self.tdict[name].columns: if self.translate(name, col.name) == cname: return col.separator - return None + return None # pragma: no cover def split_value(self, tname: str, cname: str, value) -> Union[list[str], str, None]: """Split a value if a separator is defined for the column.""" diff --git a/src/csvw/frictionless.py b/src/csvw/frictionless.py index b0bc87d..cef86de 100644 --- a/src/csvw/frictionless.py +++ b/src/csvw/frictionless.py @@ -13,7 +13,7 @@ from typing import Any, TYPE_CHECKING, Optional if TYPE_CHECKING: - from csvw.metadata import TableGroup + from csvw.metadata import TableGroup # pragma: no cover def _convert_numeric_datatype(spec): diff --git a/src/csvw/jsonld.py b/src/csvw/jsonld.py index c9ed405..9238df8 100644 --- a/src/csvw/jsonld.py +++ b/src/csvw/jsonld.py @@ -19,7 +19,7 @@ from .utils import is_url if TYPE_CHECKING: - from .metadata import Table, Column + from .metadata import Table, Column # pragma: no cover __all__ = ['group_triples', 'to_json', 'Triple', 'format_value'] diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 5a9ae16..dc64544 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -653,7 +653,7 @@ def __post_init__(self): for i, col in enumerate(self.columns): col._number = i + 1 # pylint: disable=protected-access if self.foreignKeys is None: - self.foreignKeys = [] + self.foreignKeys = [] # pragma: no cover else: res = [] for d in utils.type_checker(dict, None, self.foreignKeys): diff --git a/src/csvw/metadata_utils.py b/src/csvw/metadata_utils.py index effda0d..52fcc89 100644 --- 
a/src/csvw/metadata_utils.py +++ b/src/csvw/metadata_utils.py @@ -17,7 +17,7 @@ from .utils import is_url, slug if TYPE_CHECKING: - from csvw.metadata import TableGroup + from csvw.metadata import TableGroup # pragma: no cover __all__ = ['valid_common_property', 'valid_id_property', 'valid_context_property', 'DescriptionBase', 'dataclass_asdict', 'NAMESPACES', 'dialect_props'] diff --git a/src/csvw/utils.py b/src/csvw/utils.py index da8f91e..bcdf683 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -33,18 +33,20 @@ def from_string(cls, s): comps = re.split(r'>\s*;\s*', s.strip(), maxsplit=1) if len(comps) == 2: url, sparams = comps + url += '>' else: url, sparams = comps[0], '' - assert url.startswith('<') - url = url[1:].strip() + assert url.startswith('<') and url.endswith('>') + url = url[1:-1].strip() params = {} - for sparam in sparams.split(';'): - key, _, value = sparam.strip().partition('=') - key, value = key.strip(), (value or '').strip() - if value.startswith('"'): - assert value.endswith('"') - value = value[1:-1].strip() - params[key] = value or None + if sparams: + for sparam in sparams.split(';'): + key, _, value = sparam.strip().partition('=') + key, value = key.strip(), (value or '').strip() + if value.startswith('"'): + assert value.endswith('"') + value = value[1:-1].strip() + params[key] = value or None return cls(url=url, params=params) @classmethod @@ -68,6 +70,7 @@ def request_head(url) -> tuple[str, list[LinkHeader]]: @dataclasses.dataclass class GetResponse: + """Relevant data from an HTTP GET response.""" status_code: int = 200 content: bytes = None text: str = None @@ -79,12 +82,13 @@ def __post_init__(self): self.content = self.text.encode('utf8') @classmethod - def from_response(cls, response): + def from_response(cls, response) -> 'GetResponse': content = response.read() text = content.decode(response.headers.get_content_charset() or 'utf-8') return cls(status_code=response.status, content=content, text=text) - def json(self): 
+ def json(self) -> Any: + """The content of the repsonse parsed as JSON.""" return json.loads(self.text, object_pairs_hook=collections.OrderedDict) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 9cce7a4..d924230 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -105,19 +105,19 @@ def test_doubleQuote(self, tmp_path): t.dialect.doubleQuote = False c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) assert r'\"a\\\\b\\c\\\"d' in c - assert res[0]['col2'] == value + assert res[0]['col2'] == value # pragma: no cover - t.dialect.quoteChar = '*' - c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) - assert res[0]['col2'] == value + t.dialect.quoteChar = '*' # pragma: no cover + c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) # pragma: no cover + assert res[0]['col2'] == value # pragma: no cover - t.dialect.doubleQuote = True - c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) - assert res[0]['col2'] == value + t.dialect.doubleQuote = True # pragma: no cover + c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) # pragma: no cover + assert res[0]['col2'] == value # pragma: no cover - value = value.replace('"', '*') - c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) - assert res[0]['col2'] == value + value = value.replace('"', '*') # pragma: no cover + c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) # pragma: no cover + assert res[0]['col2'] == value # pragma: no cover @pytest.mark.xfail(reason='commentPrefix is checked only after csv.reader has parsed the line') def test_commentPrefix(self, tmp_path): diff --git a/tests/test_utils.py b/tests/test_utils.py index ef23876..dfb5c50 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,61 @@ -import pathlib +import contextlib + +import pytest from csvw import utils +@pytest.mark.parametrize( + 's,check', + [ + ('', lambda lh: lh.params == {} and lh.url == 'url'), + ('; p=5', 
lambda lh: lh.params['p'] == '5'), + ] +) +def test_LinkHeader(s, check): + assert check(utils.LinkHeader.from_string(s)) + + +def test_LinkHeader_mult(): + res = list(utils.LinkHeader.iter_links(', ')) + assert len(res) == 2 + assert res[0].url == 'url1' and res[1].url == 'url2' + + +def test_request_get(mocker): + @contextlib.contextmanager + def urlopen(url, **_): + yield mocker.Mock( + read=lambda: '"äöü"'.encode('latin1'), + status=201, + headers=mocker.Mock(get_content_charset=lambda: 'latin1') + ) + + mocker.patch('csvw.utils.urllib.request.urlopen', urlopen) + res = utils.request_get('url') + assert res.text == '"äöü"' + assert res.json() == "äöü" + + +def test_request_head(mocker): + class HTTPMessage: + @staticmethod + def info(): + return mocker.Mock( + get_all=lambda _: ['', ''], + get_content_type=lambda: 'text/html') + + @contextlib.contextmanager + def urlopen(url, **_): + yield HTTPMessage() + + mocker.patch('csvw.utils.urllib.request.Request', mocker.Mock()) + mocker.patch('csvw.utils.urllib.request.urlopen', urlopen) + content_type, links = utils.request_head('url') + assert content_type == 'text/html' + assert len(links) == 2 + + def test_normalize_name(): assert utils.normalize_name('') == '_' assert utils.normalize_name('0') == '_0' From 4438443c6e0f323e38520cb0d3e82becac14db31 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 12:45:03 +0100 Subject: [PATCH 13/17] refactored http request handling --- CHANGES | 13 ++++++++++ README.md | 6 ++++- RELEASING.md | 8 +++++++ src/csvw/utils.py | 30 ++++++++++++++++++++---- tests/{test_utils.py => test_aautils.py} | 18 ++++++++++---- 5 files changed, 65 insertions(+), 10 deletions(-) rename tests/{test_utils.py => test_aautils.py} (75%) diff --git a/CHANGES b/CHANGES index 403f2eb..849d7b0 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,19 @@ Changelog ========= +Unreleased +---------- + +- removed dependency on `attrs` + +Backwards incompatibility +~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Since 
formerly `@attr.s` decorated classes are now dataclasses, any class inheriting from these + will be broken. +- Some functions have been moved to different modules, so imports may be broken. + + Version 3.7.0 ------------- diff --git a/README.md b/README.md index e3f2bdc..18ecf03 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,10 @@ This package provides - a Python API to read and write relational, tabular data according to the [CSV on the Web](https://csvw.org/) specification and - commandline tools for reading and validating CSVW data. +> [!IMPORTANT] +> The Python API provided by `csvw` 4.x is not fully backwards compatible with `csvw` < 4. +> See [CHANGES](CHANGES) for more information. + ## Links @@ -19,7 +23,7 @@ This package provides ## Installation -This package runs under Python >=3.8, use pip to install: +This package runs under Python >=3.9, use pip to install: ```bash $ pip install csvw diff --git a/RELEASING.md b/RELEASING.md index a06e671..ff332ba 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -7,6 +7,14 @@ Releasing csvw tox -r ``` +- Run the integration test from the README: +```python +import json +from csvw import CSVW +data = CSVW('https://raw.githubusercontent.com/cldf/csvw/master/tests/fixtures/test.tsv') +print(json.dumps(data.to_json(minimal=True), indent=4)) +``` + - Make sure flake8 passes: ```shell flake8 src diff --git a/src/csvw/utils.py b/src/csvw/utils.py index bcdf683..5dcce5c 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -8,11 +8,12 @@ import keyword import logging import warnings +import contextlib import collections import dataclasses import unicodedata import urllib.request -from typing import Callable, Any, Union, Optional +from typing import Callable, Any, Union, Optional, Literal HTTP_REQUEST_TIMEOUT = 10 @@ -58,10 +59,31 @@ def iter_links(cls, s): yield cls.from_string(single if i == 0 else '<' + single) +@contextlib.contextmanager +def urlopen( + url, + method: Optional[Literal['HEAD', 'GET']] = 'GET', + 
timeout=HTTP_REQUEST_TIMEOUT, +): + """ + Open URLs + - without raising an exception on HTTP errors, + - passing a specific User-Agent header, + - specifying a timeout. + """ + from csvw import __version__ + + class NonRaisingHTTPErrorProcessor(urllib.request.HTTPErrorProcessor): + http_response = https_response = lambda self, request, response: response + + opener = urllib.request.build_opener(NonRaisingHTTPErrorProcessor) + opener.addheaders = [('User-agent', f'csvw/{__version__}')] + yield opener.open(urllib.request.Request(url, method=method), timeout=timeout) + + def request_head(url) -> tuple[str, list[LinkHeader]]: """Makes a HEAD request and returns the relevant response data.""" - req = urllib.request.Request(url, method='HEAD') - with urllib.request.urlopen(req, timeout=HTTP_REQUEST_TIMEOUT) as response: + with urlopen(url) as response: links = [] for mult in response.info().get_all('Link') or []: links.extend(LinkHeader.iter_links(mult)) @@ -94,7 +116,7 @@ def json(self) -> Any: def request_get(url: str) -> GetResponse: """Makes a GET request.""" - with urllib.request.urlopen(url, timeout=HTTP_REQUEST_TIMEOUT) as response: + with urlopen(url) as response: return GetResponse.from_response(response) diff --git a/tests/test_utils.py b/tests/test_aautils.py similarity index 75% rename from tests/test_utils.py rename to tests/test_aautils.py index dfb5c50..9d6c50c 100644 --- a/tests/test_utils.py +++ b/tests/test_aautils.py @@ -1,3 +1,4 @@ +import urllib.error import contextlib import pytest @@ -22,16 +23,24 @@ def test_LinkHeader_mult(): assert res[0].url == 'url1' and res[1].url == 'url2' +def test_urlopen(): + try: + with utils.urlopen('https://httpbin.org/delay/2', timeout=0.01) as res: + assert res.status in (404, 201) # pragma: no cover + except urllib.error.URLError as e: + assert ('timed out' in str(e)) or ('failure in name resolution' in str(e)) + + def test_request_get(mocker): @contextlib.contextmanager - def urlopen(url, **_): + def urlopen(url): 
yield mocker.Mock( read=lambda: '"äöü"'.encode('latin1'), status=201, headers=mocker.Mock(get_content_charset=lambda: 'latin1') ) - mocker.patch('csvw.utils.urllib.request.urlopen', urlopen) + mocker.patch('csvw.utils.urlopen', urlopen) res = utils.request_get('url') assert res.text == '"äöü"' assert res.json() == "äöü" @@ -46,11 +55,10 @@ def info(): get_content_type=lambda: 'text/html') @contextlib.contextmanager - def urlopen(url, **_): + def urlopen(url): yield HTTPMessage() - mocker.patch('csvw.utils.urllib.request.Request', mocker.Mock()) - mocker.patch('csvw.utils.urllib.request.urlopen', urlopen) + mocker.patch('csvw.utils.urlopen', urlopen) content_type, links = utils.request_head('url') assert content_type == 'text/html' assert len(links) == 2 From 62e703081cb990625f52124053d82e64b4b6d7da Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 16:41:35 +0100 Subject: [PATCH 14/17] more linting --- src/csvw/frictionless.py | 7 ++----- src/csvw/utils.py | 8 ++++---- tests/test_frictionless.py | 14 +++++++------- tests/{test_aautils.py => test_utils.py} | 0 4 files changed, 13 insertions(+), 16 deletions(-) rename tests/{test_aautils.py => test_utils.py} (100%) diff --git a/src/csvw/frictionless.py b/src/csvw/frictionless.py index cef86de..23d4291 100644 --- a/src/csvw/frictionless.py +++ b/src/csvw/frictionless.py @@ -10,7 +10,7 @@ """ import json import pathlib -from typing import Any, TYPE_CHECKING, Optional +from typing import Any, TYPE_CHECKING if TYPE_CHECKING: from csvw.metadata import TableGroup # pragma: no cover @@ -180,9 +180,7 @@ def __init__(self, spec, directory=None): self.json = spec - def to_tablegroup(self, cls: Optional[type] = None) -> 'TableGroup': # pylint: disable=C0116 - from csvw import TableGroup # pylint: disable=C0415 - + def to_tablegroup(self, cls: type) -> 'TableGroup': # pylint: disable=C0116 md: dict[str, Any] = {'@context': "http://www.w3.org/ns/csvw"} # Package metadata: md['dc:replaces'] = json.dumps(self.json) 
@@ -228,7 +226,6 @@ def to_tablegroup(self, cls: Optional[type] = None) -> 'TableGroup': # pylint: } md['tables'].append(table) - cls = cls or TableGroup res = cls.fromvalue(md) res._fname = self.dir / 'csvw-metadata.json' # pylint: disable=W0212 return res diff --git a/src/csvw/utils.py b/src/csvw/utils.py index 5dcce5c..bd13f00 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -71,13 +71,12 @@ def urlopen( - passing a specific User-Agent header, - specifying a timeout. """ - from csvw import __version__ - class NonRaisingHTTPErrorProcessor(urllib.request.HTTPErrorProcessor): - http_response = https_response = lambda self, request, response: response + """Don't raise exceptions on HTTP errors.""" + http_response = https_response = lambda self, req, res: res # pylint: disable=C3001 opener = urllib.request.build_opener(NonRaisingHTTPErrorProcessor) - opener.addheaders = [('User-agent', f'csvw/{__version__}')] + opener.addheaders = [('User-agent', 'csvw/4.0.0')] yield opener.open(urllib.request.Request(url, method=method), timeout=timeout) @@ -105,6 +104,7 @@ def __post_init__(self): @classmethod def from_response(cls, response) -> 'GetResponse': + """Initialize instance with data from a urllib response.""" content = response.read() text = content.decode(response.headers.get_content_charset() or 'utf-8') return cls(status_code=response.status, content=content, text=text) diff --git a/tests/test_frictionless.py b/tests/test_frictionless.py index 0d1a8ea..75a8af2 100644 --- a/tests/test_frictionless.py +++ b/tests/test_frictionless.py @@ -43,10 +43,10 @@ def test_DataPackage_init(): warnings.simplefilter('ignore') dp = DataPackage(dict(resources=[], name='x')) dp = DataPackage(dp) - assert dp.to_tablegroup().common_props['dc:identifier'] == 'x' + assert dp.to_tablegroup(TableGroup).common_props['dc:identifier'] == 'x' dp = DataPackage('{"resources": [], "name": "x", "id": "y"}') - assert dp.to_tablegroup().common_props['dc:identifier'] == 'y' - assert 
dp.to_tablegroup().common_props['dc:title'] == 'x' + assert dp.to_tablegroup(TableGroup).common_props['dc:identifier'] == 'y' + assert dp.to_tablegroup(TableGroup).common_props['dc:title'] == 'x' def test_DataPackage_constraints(datafactory): @@ -54,23 +54,23 @@ def test_DataPackage_constraints(datafactory): warnings.simplefilter('ignore') dp = datafactory([{'name': 'col', 'constraints': {'maxLength': 3}}], [['abcd']]) with pytest.raises(ValueError): - _ = list(DataPackage(dp).to_tablegroup().tables[0]) + _ = list(DataPackage(dp).to_tablegroup(TableGroup).tables[0]) dp = datafactory([{'name': 'col', 'constraints': {'pattern': '[a-z]{2}'}}], [['abcd']]) with pytest.raises(ValueError): - _ = list(DataPackage(dp).to_tablegroup().tables[0]) + _ = list(DataPackage(dp).to_tablegroup(TableGroup).tables[0]) dp = datafactory( [{'name': 'col', 'type': 'year', 'constraints': {'pattern': '[2].*'}}], [['1990']]) with pytest.raises(ValueError): - _ = list(DataPackage(dp).to_tablegroup().tables[0]) + _ = list(DataPackage(dp).to_tablegroup(TableGroup).tables[0]) def test_DataPackage(tmpfixtures): with warnings.catch_warnings(): warnings.simplefilter('ignore') dp = DataPackage(tmpfixtures / 'datapackage.json') - tg = dp.to_tablegroup() + tg = dp.to_tablegroup(TableGroup) rows = list(tg.tables[0]) assert len(rows) == 9 assert rows[-1]['Year'] == 2012 diff --git a/tests/test_aautils.py b/tests/test_utils.py similarity index 100% rename from tests/test_aautils.py rename to tests/test_utils.py From 4cfe007219c52af3b265282a0b19e32797a9fedb Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 18:55:38 +0100 Subject: [PATCH 15/17] cleaned up tests --- src/csvw/__main__.py | 2 +- src/csvw/db.py | 1 - tests/conftest.py | 115 ++++++++++++++++++++++++++-------------- tests/test_cli.py | 16 +++++- tests/test_datatypes.py | 54 ++++++++----------- tests/test_db.py | 9 +--- tests/test_metadata.py | 14 ++--- 7 files changed, 120 insertions(+), 91 deletions(-) diff --git 
a/src/csvw/__main__.py b/src/csvw/__main__.py index db87a8e..210333a 100644 --- a/src/csvw/__main__.py +++ b/src/csvw/__main__.py @@ -133,7 +133,7 @@ def csvw2json(args=None, test=False): return exit(0, test=test) # pylint: disable=R1722 -def csvw2sqlite(args=None, test=False): # pragma: no cover +def csvw2sqlite(args=None, test=False): """Convert CSVW to SQLite""" args = parsed_args( csvw2sqlite.__doc__, diff --git a/src/csvw/db.py b/src/csvw/db.py index dea0975..bce1662 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -547,7 +547,6 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): continue rows, keys = self._get_rows(t, items[t.name], refs, _skip_extra) insert(db, self.translate, t.name, keys, *rows) - print(refs) for atkey, rows in refs.items(): insert(db, self.translate, atkey[0], atkey[1:], *rows) diff --git a/tests/conftest.py b/tests/conftest.py index 5770d02..52e931b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,10 +22,17 @@ def csvw_tests_url(path) -> str: def csvw_tests_path(path) -> pathlib.Path: + """ + We have cloned the csvw test suite locally, to be able to run tests without network. + """ return pathlib.Path(__file__).parent / 'fixtures' / 'csvw' / 'tests' / path def unorder(o): + """ + To make assertions about equality of container instances work, we turn ordered dicts into + regular ones. 
+ """ if isinstance(o, dict): return {k: unorder(v) for k, v in o.items()} if isinstance(o, list): @@ -35,6 +42,21 @@ def unorder(o): @dataclasses.dataclass class CSWVTest: + """ + Python object capturing the information of a CSVW test manifest looking like + { + "id": "manifest-json#test001", + "type": "csvt:ToJsonTest", + "name": "Simple table", + "comment": "The simplest possible table without metadata", + "approval": "rdft:Approved", + "option": { + "noProv": true + }, + "action": "test001.csv", + "result": "test001.json" + } + """ id: str type: Literal[ 'csvt:NegativeJsonTest', @@ -60,6 +82,12 @@ def __post_init__(self): self.action = csvw_tests_url(self.action) self.result = csvw_tests_url(self.result) if self.result else None + @property + def csvw_instance(self) -> CSVW: + return CSVW(self.action, + md_url=self.option.get('metadata'), + validate=self.is_validation_test) + @property def is_json_test(self): return 'Json' in self.type @@ -73,6 +101,9 @@ def number(self): # pragma: no cover return int(self.id.replace('test', '')) def request_head(self, _): + """ + Used to patch `utils.request_head` in order to run tests without actual HTTP requests. + """ if self.contentType: return self.contentType, [] if self.httpLink: @@ -81,6 +112,9 @@ def request_head(self, _): @staticmethod def request_get(url): + """ + Used to patch `utils.request_get` in order to run tests without actual HTTP requests. 
+ """ url = urllib.parse.urlparse(url) if url.netloc == 'www.w3.org': if url.path.startswith('/2013/csvw/tests/'): @@ -88,16 +122,21 @@ def request_get(url): if p.exists(): return GetResponse(content=p.read_bytes()) elif url.path == '/.well-known/csvm': - return GetResponse( - text="""{+url}-metadata.json - csv-metadata.json - {+url}.json - csvm.json - """) + return GetResponse(text="""{+url}-metadata.json +csv-metadata.json +{+url}.json +csvm.json +""") return GetResponse(status_code=404) raise ValueError(url) # pragma: no cover - def _run(self): + def run(self, mocker): + # Mock HTTP requests: + mocker.patch('csvw.metadata.utils.request_head', self.request_head) + mocker.patch('csvw.metadata.utils.request_get', self.request_get) + mocker.patch('csvw.utils.request_get', self.request_get) + + # Prepare the context for running the test: with contextlib.ExitStack() as stack: if self.type == "csvt:ToJsonTestWithWarnings": stack.enter_context(pytest.warns(UserWarning)) @@ -121,8 +160,8 @@ def _run(self): elif self.type == "csvt:WarningValidationTest": stack.enter_context(pytest.warns(UserWarning)) - ds = CSVW( - self.action, md_url=self.option.get('metadata'), validate=self.is_validation_test) + ds = self.csvw_instance + if self.is_validation_test: if self.type == 'csvt:PositiveValidationTest': assert ds.is_valid @@ -135,31 +174,32 @@ def _run(self): elif self.is_json_test: assert unorder(ds.to_json(minimal=self.option.get('minimal'))) == \ - unorder(get_json(self.result)), \ - '{}: {}'.format(self.id, self.name) - - def run(self, mocker): - mocker.patch('csvw.metadata.utils.request_head', self.request_head) - mocker.patch('csvw.metadata.utils.request_get', self.request_get) - mocker.patch('csvw.utils.request_get', self.request_get) - self._run() + unorder(get_json(self.result)), f'{self.id}: {self.name}' def pytest_generate_tests(metafunc): + def iter_tests(manifest, cond, xfail): + for t in json.loads(csvw_tests_path(manifest).read_text(encoding='utf8'))['entries']: 
+ test = CSWVTest(**t) + if cond(test): + if test.number in xfail: + yield pytest.param(test, marks=pytest.mark.xfail) + else: + yield test + + # We xfail some tests, which test ambiguous parts of the spec, or require behaviour which seems + # overly complex to implement. if "csvwjsontest" in metafunc.fixturenames: + number = metafunc.config.getoption("number") + testname = "csvwjsontest" xfail = { 193: "Why do we have to format durations with particular comps, e.g. PT130M and not " "PT2H10M?", } - number = metafunc.config.getoption("number") - tests = json.loads(csvw_tests_path('manifest-json.jsonld').read_text(encoding='utf8')) - metafunc.parametrize( - "csvwjsontest", - [pytest.param(test, marks=pytest.mark.xfail) if test.number in xfail else test - for test in [CSWVTest(**t) for t in tests['entries']] - if number is None or number == test.number]) - - if "csvwnonnormtest" in metafunc.fixturenames: + manifest = 'manifest-json.jsonld' + condition = lambda t: number is None or number == t.number + elif "csvwnonnormtest" in metafunc.fixturenames: + testname = "csvwnonnormtest" xfail = { 20: "Don't understand the test.", 21: "Don't understand the test. 
If not trimming makes reading the data impossible, " @@ -171,22 +211,19 @@ def pytest_generate_tests(metafunc): 58: "Again, the trimming seems to not be expected?", 59: "Again, the trimming seems to not be expected?", } - tests = json.loads(csvw_tests_path('manifest-nonnorm.jsonld').read_text(encoding='utf8')) - metafunc.parametrize( - "csvwnonnormtest", - [pytest.param(test, marks=pytest.mark.xfail) if test.number in xfail else test - for test in [CSWVTest(**t) for t in tests['entries']] if 'Json' in test.type]) - - if "csvwvalidationtest" in metafunc.fixturenames: + manifest = 'manifest-nonnorm.jsonld' + condition = lambda t: 'Json' in t.type + elif "csvwvalidationtest" in metafunc.fixturenames: + testname = "csvwvalidationtest" xfail = { 92: "Can't detect malformed JSON if we don't know whether we are fed a metadata or a " "CSV file to begin with!", 124: "Hm. Didn't we have this as ToJson test with warnings?", } + manifest = 'manifest-validation.jsonld' number = metafunc.config.getoption("number") - tests = json.loads(csvw_tests_path('manifest-validation.jsonld').read_text(encoding='utf8')) - metafunc.parametrize( - "csvwvalidationtest", - [pytest.param(test, marks=pytest.mark.xfail) if test.number in xfail else test - for test in [CSWVTest(**t) for t in tests['entries']] - if number is None or number == test.number]) + condition = lambda t: number is None or number == t.number + else: + return + + metafunc.parametrize(testname, list(iter_tests(manifest, condition, xfail))) diff --git a/tests/test_cli.py b/tests/test_cli.py index d4ded1b..bb1eecb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,12 +1,13 @@ import os import json -import shutil import pathlib +import sqlite3 import argparse import pytest -from csvw.__main__ import csvw2json, csvw2datasette, csvwdescribe, csvwvalidate, csvw2markdown +from csvw.__main__ import ( + csvw2json, csvw2datasette, csvwdescribe, csvwvalidate, csvw2markdown, csvw2sqlite) def relpath(fname): @@ -82,3 +83,14 @@ def 
test_csvwdescribe(csvname, tsvname, capsys): def test_csvw2datasette(tmp_path, mdname): run(csvw2datasette, url=mdname, outdir=tmp_path) assert tmp_path.joinpath('datasette.db').exists() + + +def test_csvw2sqlite(tmp_path, mdname): + out = tmp_path / 'db.sqlite' + run(csvw2sqlite, url=mdname, output=out) + assert out.exists() + conn = sqlite3.connect(out) + cu = conn.cursor() + cu.execute('select count(*) from `csv.txt`') + assert cu.fetchone()[0] == 2 + conn.close() diff --git a/tests/test_datatypes.py b/tests/test_datatypes.py index c1fa6bd..0becfb2 100644 --- a/tests/test_datatypes.py +++ b/tests/test_datatypes.py @@ -81,9 +81,28 @@ def test_roundtrip(datatype, val, obj, roundtrip): assert t.formatted(o) == val +@pytest.mark.parametrize( + 'spec', + [ + {'base': 'string', 'length': 5, 'minLength': 6}, + {'base': 'string', 'length': 5, 'maxLength': 4}, + {'base': 'string', 'maxLength': 5, 'minLength': 6}, + 5, + {'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.SGS'}, + {'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.S XxX'}, + {'base': 'dateTimeStamp', 'format': 'd.M.yyyy HH:mm:ss.SSS'}, + ] +) +def test_invalid_spec(spec): + with pytest.raises(ValueError): + Datatype.fromvalue(spec) + + @pytest.mark.parametrize( 'datatype,val', [ + ({'base': 'string', 'maxLength': 4}, 'abcdefg'), + ({'base': 'string', 'minLength': 4}, 'abc'), ({'base': 'nonNegativeInteger'}, '-1'), ({'base': 'positiveInteger'}, '0'), ({'base': 'double', 'minimum': 10}, '3.1'), @@ -100,7 +119,7 @@ def test_roundtrip(datatype, val, obj, roundtrip): ({'base': 'hexBinary'}, 'spam'), ] ) -def test_invalid(datatype, val): +def test_invalid_value(datatype, val): t = Datatype.fromvalue(datatype) with pytest.raises(ValueError): t.read(val) @@ -133,7 +152,7 @@ def test_number(): assert t.formatted(v) == '3' with pytest.raises(ValueError): t.validate(12) - + t = Datatype.fromvalue( {'base': 'decimal', 'format': {'groupChar': '.', 'decimalChar': ','}}) with warnings.catch_warnings(): @@ -152,38 +171,10 
@@ def test_number(): assert t.formatted(decimal.Decimal('-3.1415')) == '3,14-' -def test_errors(): - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'string', 'length': 5, 'minLength': 6}) - - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'string', 'length': 5, 'maxLength': 4}) - - with pytest.raises(ValueError): - dt = Datatype.fromvalue({'base': 'string', 'minLength': 4}) - dt.validate('abc') - - with pytest.raises(ValueError): - dt = Datatype.fromvalue({'base': 'string', 'maxLength': 4}) - dt.validate('abcdefg') - - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'string', 'maxLength': 5, 'minLength': 6}) - - with pytest.raises(ValueError): - Datatype.fromvalue(5) - - def test_date(): with pytest.warns(UserWarning): Datatype.fromvalue({'base': 'date', 'format': '2012+12+12'}) - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.SGS'}) - - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.S XxX'}) - t = Datatype.fromvalue({'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.SSS'}) assert t.formatted(datetime.datetime(2012, 12, 12, 12, 12, 12, microsecond=12345)) == \ '12.12.2012 12:12:12.012' @@ -203,9 +194,6 @@ def test_date(): assert t.formatted(t.parse('2012-12-01T12:12:12.123456+05:30')) == \ '2012-12-01T12:12:12.123456+05:30' - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'dateTimeStamp', 'format': 'd.M.yyyy HH:mm:ss.SSS'}) - t = Datatype.fromvalue({'base': 'duration', 'format': 'P[1-5]Y'}) with pytest.raises(ValueError): t.parse('P8Y') diff --git a/tests/test_db.py b/tests/test_db.py index 9b68b8f..bf5cb98 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -15,14 +15,7 @@ @pytest.fixture def tg(): - return TableGroup.fromvalue({'tables': [ - { - 'url': 'data', - 'tableSchema': { - 'columns': [] - } - } - ]}) + return TableGroup.fromvalue({'tables': [{'url': 'data', 'tableSchema': {'columns': 
[]}}]}) @pytest.fixture diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d924230..5faad2d 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -42,7 +42,7 @@ def test_Link(link, base, res): assert csvw.Link(link).resolve(base) == res -class TestColumnEquality(object): +class TestColumnEquality: def test_get_column(self): t1 = csvw.Table.fromvalue({ @@ -58,7 +58,7 @@ def test_get_column(self): assert t1.tableSchema.columns[0] == t2.tableSchema.columns[0] -class TestColumnAccess(object): +class TestColumnAccess: def test_get_column(self): t = csvw.Table.fromvalue({ @@ -77,7 +77,7 @@ def test_get_column(self): assert t.get_column('xyz').name is None -class TestDialect(object): +class TestDialect: @staticmethod def _roundtrip(t, fpath, *items): @@ -135,7 +135,7 @@ def test_commentPrefix(self, tmp_path): assert res[0]['col1'] == '$val' -class TestNaturalLanguage(object): +class TestNaturalLanguage: def test_string(self): l = csvw.NaturalLanguage('abc') @@ -169,7 +169,7 @@ def test_serialize(self): '{"und": ["\\u00e4", "a"], "de": "\\u00f6"}' -class TestColumn(object): +class TestColumn: def test_read_rite_with_separator(self): col = csvw.Column.fromvalue({'separator': ';', 'null': 'nn'}) @@ -240,7 +240,7 @@ def _load_json(path): return json.load(f) -class TestTable(object): +class TestTable: @staticmethod def _make_table(tmp_path, data=None, metadata=None): @@ -301,7 +301,7 @@ def test_unspecified_column_in_table_without_url(self, tmp_path): list(t.iterdicts(fname=str(data))) -class TestTableGroup(object): +class TestTableGroup: @staticmethod def _make_tablegroup(tmp_path, data=None, metadata=None): From 89fdbdd1797ffcb8c804182a7e33deb929835b1f Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Wed, 25 Mar 2026 11:25:27 +0100 Subject: [PATCH 16/17] add linting to release procs --- RELEASING.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/RELEASING.md b/RELEASING.md index ff332ba..bdb5ad8 100644 --- a/RELEASING.md +++ 
b/RELEASING.md @@ -20,6 +20,11 @@ print(json.dumps(data.to_json(minimal=True), indent=4)) flake8 src ``` +- Make sure pylint passes with a score of 10: +```shell +pylint src/csvw +``` + - Make sure docs can be created: ```shell cd docs From a2937ef9fdb42f95782098a7ee063045a31f4dbf Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Wed, 25 Mar 2026 11:32:24 +0100 Subject: [PATCH 17/17] fix CI --- .github/workflows/python-package.yml | 6 +++--- tests/test_utils.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b0595ee..634450f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,14 +12,14 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12] + python-version: ["3.10", 3.11, 3.12, 3.13, 3.14] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/tests/test_utils.py b/tests/test_utils.py index 9d6c50c..e564910 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,3 +1,4 @@ +import os import urllib.error import contextlib @@ -24,6 +25,8 @@ def test_LinkHeader_mult(): def test_urlopen(): + if os.getenv("GITHUB_ACTIONS"): + return # pragma: no cover try: with utils.urlopen('https://httpbin.org/delay/2', timeout=0.01) as res: assert res.status in (404, 201) # pragma: no cover