diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b0595ee..634450f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,14 +12,14 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12] + python-version: ["3.10", 3.11, 3.12, 3.13, 3.14] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/CHANGES b/CHANGES index 403f2eb..849d7b0 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,19 @@ Changelog ========= +Unreleased +---------- + +- removed dependency on `attrs` + +Backwards incompatibility +~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Since formerly `@attr.s` decorated classes are now dataclasses, any class inheriting from these + will be broken. +- Some functions have been moved to different modules, so imports may be broken. + + Version 3.7.0 ------------- diff --git a/README.md b/README.md index e3f2bdc..18ecf03 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,10 @@ This package provides - a Python API to read and write relational, tabular data according to the [CSV on the Web](https://csvw.org/) specification and - commandline tools for reading and validating CSVW data. +> [!IMPORTANT] +> The Python API provided by `csvw` 4.x is not fully backwards compatible with `csvw` < 4. +> See [CHANGES](CHANGES) for more information. 
+ ## Links @@ -19,7 +23,7 @@ This package provides ## Installation -This package runs under Python >=3.8, use pip to install: +This package runs under Python >=3.9, use pip to install: ```bash $ pip install csvw diff --git a/RELEASING.md b/RELEASING.md index a06e671..bdb5ad8 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -7,11 +7,24 @@ Releasing csvw tox -r ``` +- Run the integration test from the README: +```python +import json +from csvw import CSVW +data = CSVW('https://raw.githubusercontent.com/cldf/csvw/master/tests/fixtures/test.tsv') +print(json.dumps(data.to_json(minimal=True), indent=4)) +``` + - Make sure flake8 passes: ```shell flake8 src ``` +- Make sure pylint passes with a score of 10: +```shell +pylint src/csvw +``` + - Make sure docs can be created: ```shell cd docs diff --git a/setup.cfg b/setup.cfg index 51eccc6..425d1d8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,12 +23,12 @@ classifiers = Natural Language :: English Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy License :: OSI Approved :: Apache Software License @@ -38,16 +38,14 @@ zip_safe = False packages = find: package_dir = = src -python_requires = >=3.8 +python_requires = >=3.9 install_requires = - attrs>=18.1 isodate python-dateutil # Pin until fix for 2.0.0 is released (https://pypi.org/project/rfc3986/#history): rfc3986<2 uritemplate>=3.0.0 babel - requests language-tags rdflib termcolor @@ -76,7 +74,6 @@ test = frictionless pytest>=5 pytest-mock - requests-mock pytest-cov docs = sphinx<7 @@ -112,7 +109,7 @@ show_missing = true skip_covered = true [tox:tox] -envlist = py38, 
py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313, py314 isolated_build = true skip_missing_interpreter = true diff --git a/src/csvw/__init__.py b/src/csvw/__init__.py index e888301..96c0298 100644 --- a/src/csvw/__init__.py +++ b/src/csvw/__init__.py @@ -1,4 +1,6 @@ -# csvw - https://w3c.github.io/csvw/primer/ +""" +csvw - https://w3c.github.io/csvw/primer/ +""" from .metadata import ( TableGroup, Table, Column, ForeignKey, Link, NaturalLanguage, Datatype, URITemplate, CSVW, diff --git a/src/csvw/__main__.py b/src/csvw/__main__.py index 3f599f8..210333a 100644 --- a/src/csvw/__main__.py +++ b/src/csvw/__main__.py @@ -1,3 +1,6 @@ +""" +CLI for the csvw package. +""" import sys import json import shutil @@ -9,10 +12,11 @@ from csvw import CSVW, TableGroup from csvw.db import Database -from csvw.utils import metadata2markdown +from csvw.metadata_utils import metadata2markdown def parsed_args(desc, args, *argspecs): + """Add custom arguments to the parser and parse.""" if args is None: # pragma: no cover parser = argparse.ArgumentParser(description=desc) for kw, kwargs in argspecs: @@ -21,23 +25,25 @@ def parsed_args(desc, args, *argspecs): return args -def exit(ret, test=False): +def exit(ret, test=False): # pylint: disable=redefined-builtin + """We don't want to exit the test suite""" if test: return ret sys.exit(ret) # pragma: no cover def csvwdescribe(args=None, test=False): + """Describe a (set of) CSV file(s) with basic CSVW metadata.""" frictionless = shutil.which('frictionless') if not frictionless: # pragma: no cover raise ValueError('The frictionless command must be installed for this functionality!\n' 'Run `pip install frictionless` and try again.') args = parsed_args( - "Describe a (set of) CSV file(s) with basic CSVW metadata.", + csvwdescribe.__doc__, args, - (['--delimiter'], dict(default=None)), - (['csv'], dict(nargs='+', help="CSV files to describe as CSVW TableGroup")), + (['--delimiter'], {'default': None}), + (['csv'], 
{'nargs': '+', 'help': "CSV files to describe as CSVW TableGroup"}), ) fargs = ['describe', '--json'] if args.delimiter: @@ -53,19 +59,20 @@ def csvwdescribe(args=None, test=False): dp = json.loads(subprocess.check_output([frictionless] + fargs + args.csv)) if onefile: - dp = dict(resources=[dp], profile='data-package') + dp = {'resources': [dp], 'profile': 'data-package'} tg = TableGroup.from_frictionless_datapackage(dp) print(json.dumps(tg.asdict(), indent=4)) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 def csvwvalidate(args=None, test=False): + """Validate a (set of) CSV file(s) described by CSVW metadata.""" args = parsed_args( - "Validate a (set of) CSV file(s) described by CSVW metadata.", + csvwvalidate.__doc__, args, - (['url'], dict(help='URL or local path to CSV or JSON metadata file.')), - (['-v', '--verbose'], dict(action='store_true', default=False)), + (['url'], {'help': 'URL or local path to CSV or JSON metadata file.'}), + (['-v', '--verbose'], {'action': 'store_true', 'default': False}), ) ret = 0 try: @@ -83,15 +90,16 @@ def csvwvalidate(args=None, test=False): print(colored('FAIL', 'red', attrs=['bold'])) if args.verbose: print(colored(str(e), 'blue')) - return exit(ret, test=test) + return exit(ret, test=test) # pylint: disable=R1722 def csvw2datasette(args=None, test=False): + """Convert CSVW to data for datasette (https://datasette.io/).""" args = parsed_args( - "Convert CSVW to data for datasette (https://datasette.io/).", + csvw2datasette.__doc__, args, - (['url'], dict(help='URL or local path to CSV or JSON metadata file.')), - (['-o', '--outdir'], dict(type=pathlib.Path, default=pathlib.Path('.'))), + (['url'], {'help': 'URL or local path to CSV or JSON metadata file.'}), + (['-o', '--outdir'], {'type': pathlib.Path, 'default': pathlib.Path('.')}), ) dbname, mdname = 'datasette.db', 'datasette-metadata.json' csvw = CSVW(args.url) @@ -99,64 +107,68 @@ def csvw2datasette(args=None, test=False): 
db.write_from_tg() md = {} for k in ['title', 'description', 'license']: - if 'dc:{}'.format(k) in csvw.common_props: - md[k] = csvw.common_props['dc:{}'.format(k)] - # FIXME: flesh out, see https://docs.datasette.io/en/stable/metadata.html + if f'dc:{k}' in csvw.common_props: + md[k] = csvw.common_props[f'dc:{k}'] args.outdir.joinpath(mdname).write_text(json.dumps(md, indent=4)) - print("""Run - datasette {} --metadata {} -and open your browser at - http://localhost:8001/ -to browse the data. -""".format(args.outdir / dbname, args.outdir / mdname)) - return exit(0, test=test) + for line in [ + "Run", + f" datasette {args.outdir / dbname} --metadata {args.outdir / mdname}", + "and open your browser at", + " http://localhost:8001/", + "to browse the data.", + ]: + print(line) + return exit(0, test=test) # pylint: disable=R1722 def csvw2json(args=None, test=False): + """Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/""" args = parsed_args( - "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/", + csvw2json.__doc__, args, - (['url'], dict(help='URL or local path to CSV or JSON metadata file.')), + (['url'], {'help': 'URL or local path to CSV or JSON metadata file.'}), ) csvw = CSVW(args.url) print(json.dumps(csvw.to_json(), indent=4)) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 -def csvw2sqlite(args=None, test=False): # pragma: no cover +def csvw2sqlite(args=None, test=False): + """Convert CSVW to SQLite""" args = parsed_args( - "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/", + csvw2sqlite.__doc__, args, ( ['url'], - dict(help='URL or local path to CSVW metadata file describing a TableGroup.\n\n' - 'Note that not all valid CSVW datasets can be converted to SQLite. 
One ' - 'limitation is that all tables which are referenced by foreign keys must ' - 'have a primary key.')), + {'help': 'URL or local path to CSVW metadata file describing a TableGroup.\n\n' + 'Note that not all valid CSVW datasets can be converted to SQLite. One ' + 'limitation is that all tables which are referenced by foreign keys must ' + 'have a primary key.'}), ( ['output'], - dict(help='Path for the generated SQLite database file.')), + {'help': 'Path for the generated SQLite database file.'}), ) tg = TableGroup.from_file(args.url) db = Database(tg, args.output) db.write_from_tg(_force=True) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 def csvw2markdown(args=None, test=False): + """Create a Markdown document containing the CSVW metadata in human readable form.""" args = parsed_args( "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/", args, ( ['url'], - dict(help='URL or local path to CSVW metadata file describing a TableGroup.\n\n' - 'Note that not all valid CSVW datasets can be converted to SQLite. One ' - 'limitation is that all tables which are referenced by foreign keys must ' - 'have a primary key.')), + {'help': 'URL or local path to CSVW metadata file describing a TableGroup.\n\n' + 'Note that not all valid CSVW datasets can be converted to SQLite. One ' + 'limitation is that all tables which are referenced by foreign keys must ' + 'have a primary key.'}), ) tg = TableGroup.from_file(args.url) print(metadata2markdown(tg, link_files=True)) - return exit(0, test=test) + return exit(0, test=test) # pylint: disable=R1722 if __name__ == '__main__': # pragma: no cover diff --git a/src/csvw/_compat.py b/src/csvw/_compat.py new file mode 100644 index 0000000..0386fff --- /dev/null +++ b/src/csvw/_compat.py @@ -0,0 +1,24 @@ +""" +Functionality to address python compatibility issues. 
+""" +import re +import sys +import datetime + + +if (sys.version_info.major, sys.version_info.minor) >= (3, 11): # pragma: no cover + fromisoformat = datetime.datetime.fromisoformat +else: + def fromisoformat(s: str) -> datetime.datetime: # pragma: no cover + """Somewhat hacky backport of the more full-fledged date parsing support in py3.11.""" + s = s.replace('Z', '+00:00') + ms_p = re.compile(r'(?P<ms>\.[0-9]+)') + m = ms_p.search(s) + ms = None + if m: + s = ms_p.sub('', s) + ms = float(f'0{m.group("ms")}') + res = datetime.datetime.fromisoformat(s) + if ms: + res = res.replace(microsecond=int(ms * 1000000) % 1000000) + return res diff --git a/src/csvw/datatypes.py b/src/csvw/datatypes.py index ad17973..6cc8ca2 100644 --- a/src/csvw/datatypes.py +++ b/src/csvw/datatypes.py @@ -1,3 +1,4 @@ +# pylint: disable=C0302 """ We model the hierarchy of basic datatypes using a class hierarchy. @@ -8,11 +9,12 @@ .. seealso:: http://w3c.github.io/csvw/metadata/#datatypes """ +import functools import re import json as _json import math import base64 -import typing +from typing import Optional, TYPE_CHECKING, Any, Callable import decimal as _decimal import binascii import datetime @@ -22,12 +24,15 @@ import isodate import rfc3986 -import dateutil.parser import babel.numbers import babel.dates +from babel.dates import format_date import jsonschema +import dateutil.parser + +from ._compat import fromisoformat -if typing.TYPE_CHECKING: # pragma: no cover +if TYPE_CHECKING: # pragma: no cover import csvw __all__ = ['DATATYPES'] @@ -35,19 +40,19 @@ DATATYPES = {} -def register(cls): +def register(cls): # pylint: disable=C0116 DATATYPES[cls.name] = cls return cls -def to_binary(s, encoding='utf-8'): +def to_binary(s, encoding='utf-8'): # pylint: disable=C0116 if not isinstance(s, bytes): return bytes(s, encoding=encoding) return s # pragma: no cover @register -class anyAtomicType: +class anyAtomicType: # pylint: disable=invalid-name """ A basic datatype consists of @@ -63,27 +68,27 @@ class
anyAtomicType: example = 'x' @classmethod - def value_error(cls, v): - raise ValueError('invalid lexical value for {}: {}'.format(cls.name, v)) + def value_error(cls, v): # pylint: disable=C0116 + raise ValueError(f'invalid lexical value for {cls.name}: {v}') def __str__(self) -> str: return self.name @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116,W0613 return {} @staticmethod - def to_python(v: str, **kw) -> object: + def to_python(v: str, **_) -> Any: # pylint: disable=C0116 return v # pragma: no cover @staticmethod - def to_string(v: object, **kw) -> str: - return '{}'.format(v) + def to_string(v: object, **_) -> str: # pylint: disable=C0116 + return f'{v}' @register -class string(anyAtomicType): +class string(anyAtomicType): # pylint: disable=invalid-name """ Maps to `str`. @@ -93,25 +98,26 @@ class string(anyAtomicType): name = 'string' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 if datatype.format: # We wrap a regex specified as `format` property into a group and add `$` to # make sure the whole string is matched when validating. try: - return {'regex': re.compile(r'({})$'.format(datatype.format))} + return { + 'regex': re.compile(r'({})$'.format(datatype.format))} # pylint: disable=C0209 except re.error: warnings.warn('Invalid regex pattern as datatype format') return {} @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None, **_): # pylint: disable=C0116 if regex and not regex.match(v): string.value_error(v) return v @register -class anyURI(string): +class anyURI(string): # pylint: disable=invalid-name """ Maps to `rfc3986.URIReference`. 
@@ -129,12 +135,12 @@ class anyURI(string): name = 'anyURI' @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None, **_): # pylint: disable=C0116 res = string.to_python(v, regex=regex) return rfc3986.URIReference.from_string(res.encode('utf-8')) @staticmethod - def to_string(v, **kw): + def to_string(v, **_): # pylint: disable=C0116 if hasattr(v, 'geturl'): # Presumably a `urllib.parse.ParseResult`. return v.geturl() @@ -146,7 +152,7 @@ def to_string(v, **kw): @register -class NMTOKEN(string): +class NMTOKEN(string): # pylint: disable=invalid-name """ Maps to `str` @@ -164,7 +170,7 @@ class NMTOKEN(string): name = "NMTOKEN" @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None, **_): # pylint: disable=C0116 v = string.to_python(v, regex=regex) if not re.fullmatch(r'[\w.:-]*', v): NMTOKEN.value_error(v) @@ -172,7 +178,7 @@ def to_python(v, regex=None): @register -class base64Binary(anyAtomicType): +class base64Binary(anyAtomicType): # pylint: disable=invalid-name """ Maps to `bytes` """ @@ -180,24 +186,24 @@ class base64Binary(anyAtomicType): example = 'YWJj' @staticmethod - def to_python(v, **kw): + def to_python(v, **_): # pylint: disable=C0116 try: res = to_binary(v, encoding='ascii') except UnicodeEncodeError: base64Binary.value_error(v[:10]) try: res = base64.decodebytes(res) - except Exception: - raise ValueError('invalid base64 encoding') + except Exception as e: + raise ValueError('invalid base64 encoding') from e return res @staticmethod - def to_string(v, **kw): + def to_string(v, **_): # pylint: disable=C0116 return base64.encodebytes(v).decode().strip() @register -class _binary(base64Binary): +class _binary(base64Binary): # pylint: disable=invalid-name """ Maps to `bytes`. Alias for :class:`base64Binary` """ @@ -205,7 +211,7 @@ class _binary(base64Binary): @register -class hexBinary(anyAtomicType): +class hexBinary(anyAtomicType): # pylint: disable=invalid-name """ Maps to `bytes`. 
@@ -218,24 +224,24 @@ class hexBinary(anyAtomicType): example = 'ab' @staticmethod - def to_python(v, **kw): + def to_python(v, **_): # pylint: disable=C0116 try: res = to_binary(v, encoding='ascii') except UnicodeEncodeError: hexBinary.value_error(v[:10]) try: res = binascii.unhexlify(res) - except (binascii.Error, TypeError): - raise ValueError('invalid hexBinary encoding') + except (binascii.Error, TypeError) as e: + raise ValueError('invalid hexBinary encoding') from e return res @staticmethod - def to_string(v, **kw): + def to_string(v, **_): # pylint: disable=C0116 return binascii.hexlify(v).decode().upper() @register -class boolean(anyAtomicType): +class boolean(anyAtomicType): # pylint: disable=invalid-name """ Maps to `bool`. @@ -255,7 +261,7 @@ class boolean(anyAtomicType): example = 'false' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 if datatype.format and isinstance(datatype.format, str) and datatype.format.count('|') == 1: true, false = [[v] for v in datatype.format.split('|')] else: @@ -266,29 +272,30 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return {'true': true, 'false': false} @staticmethod - def to_python(s, true=('true', '1'), false=('false', '0')): - if isinstance(s, bool) or s is None: - return s - if s in true: + def to_python(v, true=('true', '1'), false=('false', '0'), **_): # pylint: disable=C0116 + if isinstance(v, bool) or v is None: + return v + if v in true: return True - if s in false: + if v in false: return False - raise boolean.value_error(s) + raise boolean.value_error(v) @staticmethod - def to_string(v, true=('true', '1'), false=('false', '0')): + def to_string(v, true=('true', '1'), false=('false', '0'), **_): # pylint: disable=C0116 return (true if v else false)[0] -def with_tz(v, func, args, kw): +def with_tz(v, func: Callable[..., datetime.datetime], args: tuple, kw: dict) -> 
datetime.datetime: + """Handle timezone when parsing a datatime using func.""" tz_pattern = re.compile('(Z|[+-][0-2][0-9]:[0-5][0-9])$') tz = tz_pattern.search(v) - if tz: + if tz: # We split off the timezone and handle it separately. v = v[:tz.start()] tz = tz.groups()[0] res = func(v, *args, **kw) if tz: - dt = dateutil.parser.parse('{}{}'.format(datetime.datetime.now(), tz)) + dt = dateutil.parser.parse(f'{datetime.datetime.now()}{tz}') res = datetime.datetime( res.year, res.month, res.day, res.hour, res.minute, res.second, res.microsecond, dt.tzinfo) @@ -296,7 +303,7 @@ def with_tz(v, func, args, kw): @register -class dateTime(anyAtomicType): +class dateTime(anyAtomicType): # pylint: disable=invalid-name """ Maps to `datetime.datetime`. """ @@ -305,7 +312,7 @@ class dateTime(anyAtomicType): example = '2018-12-10T20:20:20' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return dt_format_and_regex(datatype.format) @staticmethod @@ -329,30 +336,29 @@ def _parse(v, cls, regex, tz_marker=None): comps[a] = getattr(d, a) res = cls(**{k: int(v) for k, v in comps.items() if v is not None}) if tz_marker: - # Let dateutils take care of parsing the timezone info: res = res.replace(tzinfo=dateutil.parser.parse(v).tzinfo) return res @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_python(v, regex=None, tz_marker=None, pattern=None, **_): # pylint: disable=C0116 if pattern and regex: match = regex.match(v) if not match: - raise ValueError('{} -- {} -- {}'.format(pattern, v, regex)) # pragma: + raise ValueError(f'{pattern} -- {v} -- {regex}') # pragma: try: - return dateutil.parser.isoparse(v) + return fromisoformat(v) except ValueError: return dateTime._parse(v, datetime.datetime, regex, tz_marker=tz_marker) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_string(v, 
pattern=None, **_): # pylint: disable=C0116 if pattern: return babel.dates.format_datetime(v, tzinfo=v.tzinfo, format=pattern) return v.isoformat() @register -class _dateTime(dateTime): +class _dateTime(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime`. Alias for :class:`dateTime` """ @@ -360,7 +366,7 @@ class _dateTime(dateTime): @register -class date(dateTime): +class date(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime` (in order to be able to preserve timezone information). """ @@ -368,7 +374,7 @@ class date(dateTime): example = '2018-12-10' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 try: return dt_format_and_regex(datatype.format or 'yyyy-MM-dd') except ValueError: @@ -376,20 +382,25 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return dt_format_and_regex('yyyy-MM-dd') @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_python(v, # pylint: disable=C0116 + regex=None, tz_marker=None, pattern=None, fmt=None, **_): return with_tz( - v.strip(), dateTime.to_python, [], dict(regex=regex, fmt=fmt, pattern=pattern)) + v.strip(), dateTime.to_python, [], {'regex': regex, 'fmt': fmt, 'pattern': pattern}) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): - from babel.dates import format_date + def to_string(v, # pylint: disable=C0116,W0221 + pattern=None, + regex=None, + tz_marker=None, + fmt=None, + **_): if pattern: return format_date(v, format=pattern, locale='en') return dateTime.to_string(v, regex=regex, fmt=fmt, tz_marker=tz_marker, pattern=pattern) @register -class dateTimeStamp(dateTime): +class dateTimeStamp(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime`. 
""" @@ -397,7 +408,7 @@ class dateTimeStamp(dateTime): example = '2018-12-10T20:20:20' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 res = dt_format_and_regex(datatype.format or 'yyyy-MM-ddTHH:mm:ss.SSSSSSXXX') if not res['tz_marker']: raise ValueError('dateTimeStamp must have timezone marker') @@ -405,7 +416,7 @@ def derived_description(datatype: "csvw.Datatype") -> dict: @register -class _time(dateTime): +class _time(dateTime): # pylint: disable=invalid-name """ Maps to `datetime.datetime` (in order to be able to preserve timezone information). """ @@ -413,23 +424,23 @@ class _time(dateTime): example = '20:20:20' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return dt_format_and_regex(datatype.format or 'HH:mm:ss', no_date=True) @staticmethod - def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_python(v, regex=None, tz_marker=None, pattern=None, **_): # pylint: disable=C0116 if pattern and 'x' in pattern.lower(): - return dateutil.parser.parse('{}T{}'.format(datetime.date.today().isoformat(), v)) + return dateutil.parser.parse(f'{datetime.date.today().isoformat()}T{v}') assert regex is not None - return with_tz(v, dateTime._parse, [datetime.datetime, regex], dict(tz_marker=tz_marker)) + return with_tz(v, dateTime._parse, [datetime.datetime, regex], {'tz_marker': tz_marker}) @staticmethod - def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None): + def to_string(v, pattern=None, **_): # pylint: disable=C0116 return babel.dates.format_time(v, tzinfo=v.tzinfo, format=pattern) @register -class duration(anyAtomicType): +class duration(anyAtomicType): # pylint: disable=invalid-name """ Maps to `datetime.timedelta`. 
@@ -446,22 +457,22 @@ class duration(anyAtomicType): example = 'P3Y6M4DT12H30M5S' @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 return {'format': datatype.format} @staticmethod - def to_python(v, format=None, **kw): + def to_python(v, format=None, **_): # pylint: disable=C0116,W0622 if format and not re.match(format, v): raise ValueError return isodate.parse_duration(v) @staticmethod - def to_string(v, format=None, **kw): + def to_string(v, format=None, **_): # pylint: disable=C0116,W0613,W0622 return isodate.duration_isoformat(v) @register -class dayTimeDuration(duration): +class dayTimeDuration(duration): # pylint: disable=invalid-name """ Maps to `datetime.timedelta`. """ @@ -469,7 +480,7 @@ class dayTimeDuration(duration): @register -class yearMonthDuration(duration): +class yearMonthDuration(duration): # pylint: disable=invalid-name """ Maps to `datetime.timedelta`. """ @@ -477,7 +488,7 @@ class yearMonthDuration(duration): @register -class decimal(anyAtomicType): +class decimal(anyAtomicType): # pylint: disable=invalid-name """ Maps to `decimal.Decimal`. 
@@ -521,18 +532,19 @@ class decimal(anyAtomicType): _reverse_special = {v: k for k, v in _special.items()} @staticmethod - def derived_description(datatype: "csvw.Datatype") -> dict: + def derived_description(datatype: "csvw.Datatype") -> dict: # pylint: disable=C0116 if datatype.format: return datatype.format if isinstance(datatype.format, dict) \ else {'pattern': datatype.format} return {} @staticmethod - def to_python(v, pattern=None, decimalChar=None, groupChar=None): - if isinstance(v, str) and 'e' in v.lower(): - raise ValueError('Invalid value for decimal') - - if isinstance(v, str) and re.search('{0}{0}+'.format(re.escape(groupChar or ',')), v): + def to_python(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116,W0221 + if any(( + isinstance(v, str) and 'e' in v.lower(), + isinstance(v, str) and # noqa: W504 + re.search(f"{re.escape(groupChar or ',')}{re.escape(groupChar or ',')}+", v), + )): raise ValueError('Invalid value for decimal') if groupChar is None and pattern and ',' in pattern: @@ -541,8 +553,7 @@ def to_python(v, pattern=None, decimalChar=None, groupChar=None): decimalChar = '.' 
if pattern and not NumberPattern(pattern).is_valid( v.replace(groupChar or ',', ',').replace(decimalChar or '.', '.')): - raise ValueError( - 'Invalid value "{}" for decimal with pattern "{}"'.format(v, pattern)) + raise ValueError(f'Invalid value "{v}" for decimal with pattern "{pattern}"') factor = 1 if isinstance(v, str): @@ -563,11 +574,12 @@ def to_python(v, pattern=None, decimalChar=None, groupChar=None): return _decimal.Decimal(v) * factor except (TypeError, _decimal.InvalidOperation): decimal.value_error(v) + return None # pragma: no cover @staticmethod - def to_string(v, pattern=None, decimalChar=None, groupChar=None): - if '{}'.format(v) in decimal._reverse_special: - return decimal._reverse_special['{}'.format(v)] + def to_string(v, pattern=None, decimalChar=None, groupChar=None): # pylint: disable=C0116,W0221 + if f'{v}' in decimal._reverse_special: + return decimal._reverse_special[f'{v}'] if pattern: v = babel.numbers.format_decimal(v, pattern, 'en') @@ -589,8 +601,8 @@ def to_string(v, pattern=None, decimalChar=None, groupChar=None): exp = int(exp) zero_padding = '0' * (abs(int(exp)) - 1) sign = '-' if neg else '' - return '{}{}{}.0'.format(sign, digits, zero_padding) if exp > 0 else ( - '{}0.{}{}'.format(sign, zero_padding, digits)) + return f'{sign}{digits}{zero_padding}.0' if exp > 0 \ + else f'{sign}0.{zero_padding}{digits}' if groupChar or decimalChar: def repl(m): @@ -598,33 +610,34 @@ def repl(m): return groupChar if m.group('c') == '.': return decimalChar - r = '(?P<c>[{}])'.format(re.escape((decimalChar or '') + (groupChar or ''))) - v = re.sub(r, repl, v) + raise ValueError(m.group('c')) # pragma: no cover + v = re.sub(r"(?P<c>[,.])", repl, v) return v @register -class integer(decimal): +class integer(decimal): # pylint: disable=invalid-name """ Maps to `int`.
""" name = 'integer' - range = None + range: Optional[tuple[int, int]] = None @classmethod - def to_python(cls, v, **kw): + def to_python(cls, v, **kw): # pylint: disable=C0116,W0221 res = decimal.to_python(v, **kw) numerator, denominator = res.as_integer_ratio() if denominator == 1: - if cls.range and not (cls.range[0] <= numerator <= cls.range[1]): - raise ValueError("{} must be an integer between {} and {}, but got ".format( - cls.name, cls.range[0], cls.range[1]), v) + if cls.range and not cls.range[0] <= numerator <= cls.range[1]: # pylint: disable=E1136 + raise ValueError( + f"{cls.name} must be an integer between " + f"{cls.range[0]} and {cls.range[1]}, but got ", v) # pylint: disable=E1136 return numerator raise ValueError('Invalid value for integer') @register -class _int(integer): +class _int(integer): # pylint: disable=invalid-name """ Maps to `int`. Alias for :class:`integer`. """ @@ -632,7 +645,7 @@ class _int(integer): @register -class unsignedInt(integer): +class unsignedInt(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -651,7 +664,7 @@ class unsignedInt(integer): @register -class unsignedShort(integer): +class unsignedShort(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -670,7 +683,7 @@ class unsignedShort(integer): @register -class unsignedLong(integer): +class unsignedLong(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -689,7 +702,7 @@ class unsignedLong(integer): @register -class unsignedByte(integer): +class unsignedByte(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -711,7 +724,7 @@ class unsignedByte(integer): @register -class short(integer): +class short(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -730,7 +743,7 @@ class short(integer): @register -class long(integer): +class long(integer): # pylint: disable=invalid-name """ Maps to `int`. 
@@ -750,7 +763,7 @@ class long(integer): @register -class byte(integer): +class byte(integer): # pylint: disable=invalid-name """ Maps to `int`. @@ -770,7 +783,7 @@ class byte(integer): @register -class nonNegativeInteger(integer): +class nonNegativeInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -779,7 +792,7 @@ class nonNegativeInteger(integer): @register -class positiveInteger(integer): +class positiveInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -788,7 +801,7 @@ class positiveInteger(integer): @register -class nonPositiveInteger(integer): +class nonPositiveInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -798,7 +811,7 @@ class nonPositiveInteger(integer): @register -class negativeInteger(integer): +class negativeInteger(integer): # pylint: disable=invalid-name """ Maps to `int`. """ @@ -808,7 +821,7 @@ class negativeInteger(integer): @register -class _float(anyAtomicType): +class _float(anyAtomicType): # pylint: disable=invalid-name """ Maps to `float`. @@ -831,10 +844,9 @@ def derived_description(datatype: "csvw.Datatype") -> dict: return {} @staticmethod - def to_python(v, pattern=None, **kw): + def to_python(v, pattern=None, **_): # pylint: disable=R1710 if pattern and not NumberPattern(pattern).is_valid(v): - raise ValueError( - 'Invalid value "{}" for number with pattern "{}"'.format(v, pattern)) + raise ValueError(f'Invalid value "{v}" for number with pattern "{pattern}"') try: return float(v) @@ -842,12 +854,12 @@ def to_python(v, pattern=None, **kw): _float.value_error(v) @staticmethod - def to_string(v, **kw): - return '{}'.format(v) + def to_string(v, **_): # pylint: disable=C0116 + return f'{v}' @register -class number(_float): +class number(_float): # pylint: disable=invalid-name """ Maps to `float`. """ @@ -855,7 +867,7 @@ class number(_float): @register -class double(_float): +class double(_float): # pylint: disable=invalid-name """ Maps to `float`. 
""" @@ -863,7 +875,7 @@ class double(_float): @register -class normalizedString(string): +class normalizedString(string): # pylint: disable=invalid-name """ Maps to `str`. @@ -881,7 +893,7 @@ class normalizedString(string): name = 'normalizedString' @staticmethod - def to_python(v, regex=None): + def to_python(v, regex=None, **_): if v: for c in '\r\n\t': v = v.replace(c, ' ') @@ -898,7 +910,7 @@ class QName(string): @register -class gDay(string): +class gDay(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -906,7 +918,7 @@ class gDay(string): @register -class gMonth(string): +class gMonth(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -914,7 +926,7 @@ class gMonth(string): @register -class gMonthDay(string): +class gMonthDay(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -922,7 +934,7 @@ class gMonthDay(string): @register -class gYear(string): +class gYear(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -930,7 +942,7 @@ class gYear(string): @register -class gYearMonth(string): +class gYearMonth(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -938,7 +950,7 @@ class gYearMonth(string): @register -class xml(string): +class xml(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -946,7 +958,7 @@ class xml(string): @register -class html(string): +class html(string): # pylint: disable=invalid-name """ Maps to `str`. """ @@ -954,7 +966,7 @@ class html(string): @register -class json(string): +class json(string): # pylint: disable=invalid-name """ Maps to `str`, `list` or `dict`, i.e. to the result of `json.loads`. @@ -1006,10 +1018,9 @@ def derived_description(datatype: "csvw.Datatype") -> dict: pass return {} - # FIXME: ignored **kw? # why not just to_python = staticmethod(_json.loads)? 
@staticmethod - def to_python(v, schema=None, **kw): + def to_python(v, schema=None, **_): # pylint: disable=W0237 res = _json.loads(v, object_pairs_hook=collections.OrderedDict) if schema: try: @@ -1019,10 +1030,17 @@ def to_python(v, schema=None, **kw): return res @staticmethod - def to_string(v, **kw): + def to_string(v, **_): return _json.dumps(v) +def _get_sep(dfmt, options): + for d_sep in options: # Determine the separator used for date components. + if d_sep in dfmt: + return d_sep + return None + + def dt_format_and_regex(fmt, no_date=False): """ @@ -1061,30 +1079,11 @@ def dt_format_and_regex(fmt, no_date=False): "MM.dd.yyyy", # e.g., 03.22.2015 "M.d.yyyy", # e.g., 3.22.2015 } - time_patterns = {"HH:mm:ss", "HHmmss", "HH:mm", "HHmm"} - - # We map dateTime component markers to corresponding fromat specs and regular - # expressions used for formatting and parsing. - translate = { - 'yyyy': ('{dt.year:04d}', '(?P[0-9]{4})'), - 'MM': ('{dt.month:02d}', '(?P[0-9]{2})'), - 'dd': ('{dt.day:02d}', '(?P[0-9]{2})'), - 'M': ('{dt.month}', '(?P[0-9]{1,2})'), - 'd': ('{dt.day}', '(?P[0-9]{1,2})'), - 'HH': ('{dt.hour:02d}', '(?P[0-9]{2})'), - 'mm': ('{dt.minute:02d}', '(?P[0-9]{2})'), - 'ss': ('{dt.second:02d}', '(?P[0-9]{2})'), - } - - for dt_sep in ' T': # Only a single space or "T" may separate date and time format. - # Since space or "T" isn't allowed anywhere else in the format, checking whether - # we are dealing with a date or dateTime format is simple: - if dt_sep in fmt: - break - else: - dt_sep = None - + # Only a single space or "T" may separate date and time format. 
+ # Since space or "T" isn't allowed anywhere else in the format, checking whether + # we are dealing with a date or dateTime format is simple: + dt_sep = _get_sep(fmt, ' T') if dt_sep: dfmt, tfmt = fmt.split(dt_sep) elif no_date: @@ -1103,50 +1102,51 @@ def dt_format_and_regex(fmt, no_date=False): if (dfmt and dfmt not in date_patterns) or (tfmt and tfmt not in time_patterns): raise ValueError(fmt) - regex, format = '', '' # Initialize the output. + regex, format = _get_regex_and_format(dfmt, tfmt, dt_sep, msecs) # pylint: disable=W0622 + return {'regex': re.compile(regex), 'fmt': format, 'tz_marker': tz_marker, 'pattern': pattern} - if dfmt: - for d_sep in '.-/': # Determine the separator used for date components. - if d_sep in dfmt: - break - else: - d_sep = None - if d_sep: - # Iterate over date components, converting them to string format specs and regular - # expressions. - for i, part in enumerate(dfmt.split(d_sep)): +def _get_regex_and_format(dfmt, tfmt, dt_sep, msecs): + def _add_chars(fmt, ff, rr, sep=None): + if sep: + for i, part in enumerate(fmt.split(sep)): if i > 0: - format += d_sep - regex += re.escape(d_sep) + ff += sep + rr += re.escape(sep) f, r = translate[part] - format += f - regex += r + ff += f + rr += r else: - for _, chars in itertools.groupby(dfmt, lambda k: k): + for _, chars in itertools.groupby(fmt, lambda k: k): f, r = translate[''.join(chars)] - format += f - regex += r + ff += f + rr += r + return ff, rr + + # We map dateTime component markers to corresponding format specs and regular + # expressions used for formatting and parsing.
+ translate = { + 'yyyy': ('{dt.year:04d}', '(?P[0-9]{4})'), + 'MM': ('{dt.month:02d}', '(?P[0-9]{2})'), + 'dd': ('{dt.day:02d}', '(?P[0-9]{2})'), + 'M': ('{dt.month}', '(?P[0-9]{1,2})'), + 'd': ('{dt.day}', '(?P[0-9]{1,2})'), + 'HH': ('{dt.hour:02d}', '(?P[0-9]{2})'), + 'mm': ('{dt.minute:02d}', '(?P[0-9]{2})'), + 'ss': ('{dt.second:02d}', '(?P[0-9]{2})'), + } + + regex, format = '', '' # Initialize the output. pylint: disable=redefined-builtin + + if dfmt: + format, regex = _add_chars(dfmt, format, regex, _get_sep(dfmt, '.-/')) if dt_sep: format += dt_sep regex += re.escape(dt_sep) if tfmt: - # For time components the only valid separator is ":". - if ':' in tfmt: - for i, part in enumerate(tfmt.split(':')): - if i > 0: - format += ':' - regex += re.escape(':') - f, r = translate[part] - format += f - regex += r - else: - for _, chars in itertools.groupby(tfmt, lambda k: k): - f, r = translate[''.join(chars)] - format += f - regex += r + format, regex = _add_chars(tfmt, format, regex, ':' if ':' in tfmt else None) # Fractions of seconds are a bit of a problem, because datetime objects only offer # microseconds. @@ -1154,8 +1154,7 @@ def dt_format_and_regex(fmt, no_date=False): format += '.{microsecond:.%s}' % msecs regex += r'(\.(?P[0-9]{1,%s})(?![0-9]))?' % msecs regex += r'(\.(?P[0-9]{%s,})(?![0-9]))?' % (msecs + 1,) - - return {'regex': re.compile(regex), 'fmt': format, 'tz_marker': tz_marker, 'pattern': pattern} + return regex, format class NumberPattern: @@ -1167,36 +1166,51 @@ class NumberPattern: The number of # placeholder characters before the decimal do not matter, since no limit is placed on the maximum number of digits. There should, however, be at least one zero someplace in the pattern. - """ + Example: #,##0.## + + .. 
seealso:: ``_ + """ def __init__(self, pattern): assert pattern.count(';') <= 1 self.positive, _, self.negative = pattern.partition(';') if not self.negative: self.negative = '-' + self.positive.replace('+', '') - @property - def primary_grouping_size(self): + @functools.cached_property + def primary_grouping_size(self) -> int: + """ + Number of digits in the primary grouping, i.e. the size of the chunk between the + secondary grouping character and the decimal point. + """ comps = self.positive.split('.')[0].split(',') if len(comps) > 1: return comps[-1].count('#') + comps[-1].count('0') - - @property - def secondary_grouping_size(self): + return 0 + + @functools.cached_property + def secondary_grouping_size(self) -> int: + """ + Number of digits in the secondary grouping, i.e. the size of the chunk between two + secondary grouping characters. + """ comps = self.positive.split('.')[0].split(',') if len(comps) > 2: return comps[1].count('#') + comps[1].count('0') return self.primary_grouping_size - @property - def min_digits_before_decimal_point(self): + @functools.cached_property + def min_digits_before_decimal_point(self) -> int: + """Number of 0s before the decimal point in the pattern.""" integral_part = self.positive.split('.')[0] match = re.search('([0]+)$', integral_part) if match: return len(match.groups()[0]) + return 0 - @property - def exponent_digits(self): + @functools.cached_property + def exponent_digits(self) -> int: + """Number of digits in the exponent in the pattern.""" _, _, exponent = self.positive.lower().partition('e') i = 0 for c in exponent: @@ -1208,8 +1222,9 @@ def exponent_digits(self): break return i - @property - def decimal_digits(self): + @functools.cached_property + def decimal_digits(self) -> int: + """Number of decimal digits in the pattern.""" i = 0 _, _, decimal_part = self.positive.partition('.') for c in decimal_part: @@ -1219,8 +1234,9 @@ def decimal_digits(self): break return i - @property - def 
significant_decimal_digits(self): + @functools.cached_property + def significant_decimal_digits(self) -> int: + """Number of *significant* decimal digits in the pattern, i.e. 0 counts, # does not.""" i = 0 _, _, decimal_part = self.positive.partition('.') for c in decimal_part: @@ -1230,16 +1246,12 @@ def significant_decimal_digits(self): break return i - def is_valid(self, s): - def digits(ss): - return [c for c in ss if c not in '.,E+-%‰'] - - integral_part, _, decimal_part = s.partition('.') - decimal_part, _, exponent = decimal_part.lower().partition('e') - groups = integral_part.split(',') + @staticmethod + def _get_significant(groups): significant, leadingzero, skip = [], False, True + for c in ''.join(groups): - if c in ['+', '-', '%', # fixme: permil + if c in ['+', '-', '%', # fixme: permil # pylint: disable=fixme ]: continue if c == '0' and skip: @@ -1250,14 +1262,47 @@ def digits(ss): significant.append(c) if not significant and leadingzero: significant = ['0'] - if self.min_digits_before_decimal_point and \ - len(significant) < self.min_digits_before_decimal_point: + return significant + + def is_valid(self, s: str) -> bool: + """Validates a string representing a number against the pattern.""" + def digits(ss): + return [c for c in ss if c not in '.,E+-%‰'] + + integral_part, _, decimal_part = s.partition('.') + decimal_part, _, _ = decimal_part.lower().partition('e') + groups = integral_part.split(',') + significant = self._get_significant(groups) + + if any(( + all(( + self.min_digits_before_decimal_point, + len(significant) < self.min_digits_before_decimal_point)), + all(( + self.primary_grouping_size, + groups, + len(digits(groups[-1])) > self.primary_grouping_size)), + all(( + self.primary_grouping_size, + groups, + len(groups) > 1, + len(digits(groups[-1])) < self.primary_grouping_size)), + all(( + decimal_part, + len(digits(decimal_part)) > self.decimal_digits, + )), + all(( + self.significant_decimal_digits, + (not decimal_part) or 
(len(digits(decimal_part)) < self.significant_decimal_digits), + )), + all(( + self.exponent_digits, + 'e' in s.lower(), + len(digits(s.lower().split('e')[-1])) > self.exponent_digits + )), + )): return False - if self.primary_grouping_size and groups: - if len(digits(groups[-1])) > self.primary_grouping_size: - return False - if len(groups) > 1 and len(digits(groups[-1])) < self.primary_grouping_size: - return False + if self.secondary_grouping_size and len(groups) > 1: for i, group in enumerate(groups[:-1]): if i == 0: @@ -1266,15 +1311,5 @@ def digits(ss): else: if len(digits(group)) != self.secondary_grouping_size: return False - if decimal_part: - if len(digits(decimal_part)) > self.decimal_digits: - return False - if self.significant_decimal_digits: - if (not decimal_part) or (len(digits(decimal_part)) < self.significant_decimal_digits): - return False - - if self.exponent_digits and 'e' in s.lower(): - if len(digits(s.lower().split('e')[-1])) > self.exponent_digits: - return False return True diff --git a/src/csvw/db.py b/src/csvw/db.py index 8a03c62..bce1662 100644 --- a/src/csvw/db.py +++ b/src/csvw/db.py @@ -26,69 +26,64 @@ not enforced by the database. 
""" import json -import typing +from typing import Optional, Union, Protocol, Callable, Any import decimal import pathlib import sqlite3 import functools import contextlib import collections - -import attr +from collections.abc import Sequence, Iterator +import dataclasses import csvw from csvw.datatypes import DATATYPES -from csvw.metadata import TableGroup +from csvw.metadata import TableGroup, Datatype +from .utils import optcast -def identity(s): +def identity(s): # pylint: disable=C0116 return s +@dataclasses.dataclass +class DBType: + """A DB datatype together with read/write converters.""" + name: str + convert: Callable[[Any], Any] = identity + read: Callable[[Any], Any] = identity + + TYPE_MAP = { - 'string': ( - 'TEXT', - identity, - identity), - 'integer': ( - 'INTEGER', - identity, - identity), - 'boolean': ( - 'INTEGER', - lambda s: s if s is None else int(s), - lambda s: s if s is None else bool(s)), - 'decimal': ( - 'REAL', - lambda s: s if s is None else float(s), - lambda s: s if s is None else decimal.Decimal(s)), - 'hexBinary': ( - 'BLOB', - identity, - identity), + 'string': DBType('TEXT'), + 'integer': DBType('INTEGER'), + 'boolean': DBType('INTEGER', optcast(int), optcast(bool)), + 'decimal': DBType('REAL', optcast(float), optcast(decimal.Decimal)), + 'hexBinary': DBType('BLOB'), } -class SchemaTranslator(typing.Protocol): - def __call__(self, table: str, column: typing.Optional[str] = None) -> str: +class SchemaTranslator(Protocol): # pylint: disable=R0903,C0115 + def __call__(self, table: str, column: Optional[str] = None) -> str: ... # pragma: no cover -class ColumnTranslator(typing.Protocol): +class ColumnTranslator(Protocol): # pylint: disable=R0903,C0115 def __call__(self, column: str) -> str: ... 
# pragma: no cover -def quoted(*names): - return ','.join('`{0}`'.format(name) for name in names) +def quoted(*names: str) -> str: + """Returns a comma-separated list of quoted schema object names.""" + return ','.join(f'`{name}`' for name in names) def insert(db: sqlite3.Connection, translate: SchemaTranslator, table: str, - keys: typing.Sequence[str], + keys: Sequence[str], *rows: list, - single: typing.Optional[bool] = False): + single: Optional[bool] = False): """ Insert a sequence of rows into a table. @@ -101,13 +96,12 @@ def insert(db: sqlite3.Connection, a time, allowing for more focused debugging output in case of errors. """ if rows: - sql = "INSERT INTO {0} ({1}) VALUES ({2})".format( - quoted(translate(table)), - quoted(*[translate(table, k) for k in keys]), - ','.join(['?' for _ in keys])) + cols = quoted(*[translate(table, k) for k in keys]) + vals = ','.join(['?' for _ in keys]) + sql = f"INSERT INTO {quoted(translate(table))} ({cols}) VALUES ({vals})" try: db.executemany(sql, rows) - except: # noqa: E722 - this is purely for debugging. + except: # noqa: E722 - this is purely for debugging. pylint: disable=bare-except if not single: for row in rows: insert(db, translate, table, keys, row, single=True) @@ -117,37 +111,36 @@ def insert(db: sqlite3.Connection, raise -def select(db: sqlite3.Connection, table: str) -> typing.Tuple[typing.List[str], typing.Sequence]: - cu = db.execute("SELECT * FROM {0}".format(quoted(table))) +def select(db: sqlite3.Connection, table: str) -> tuple[list[str], Sequence]: + """Shortcut to construct and execute simple SELECT statements.""" + cu = db.execute(f"SELECT * FROM {quoted(table)}") cols = [d[0] for d in cu.description] return cols, list(cu.fetchall()) -@attr.s +@dataclasses.dataclass class ColSpec: """ A `ColSpec` captures sufficient information about a :class:`csvw.Column` for the DB schema. 
""" - name = attr.ib() - csvw_type = attr.ib(default='string', converter=lambda s: s if s else 'string') - separator = attr.ib(default=None) - db_type = attr.ib(default=None) - convert = attr.ib(default=None) - read = attr.ib(default=None) - required = attr.ib(default=False) - csvw = attr.ib(default=None) - - def __attrs_post_init__(self): + name: str + csvw_type: str = 'string' + separator: str = None + db_type: DBType = None + required: bool = False + csvw: Datatype = None + + def __post_init__(self): + self.csvw_type = self.csvw_type or 'string' if self.csvw_type in TYPE_MAP: - self.db_type, self.convert, self.read = TYPE_MAP[self.csvw_type] + self.db_type = TYPE_MAP[self.csvw_type] else: - self.db_type = 'TEXT' - self.convert = DATATYPES[self.csvw_type].to_string - self.read = DATATYPES[self.csvw_type].to_python + self.db_type = DBType( + 'TEXT', DATATYPES[self.csvw_type].to_string, DATATYPES[self.csvw_type].to_python) if self.separator and self.db_type != 'TEXT': - self.db_type = 'TEXT' + self.db_type = DBType('TEXT', self.db_type.convert, self.db_type.read) - def check(self, translate: ColumnTranslator) -> typing.Optional[str]: + def check(self, translate: ColumnTranslator) -> Optional[str]: """ We try to convert as many data constraints as possible into SQLite CHECK constraints. @@ -155,7 +148,7 @@ def check(self, translate: ColumnTranslator) -> typing.Optional[str]: :return: A string suitable as argument of an SQL CHECK constraint. 
""" if not self.csvw: - return + return None c, cname = self.csvw, translate(self.name) constraints = [] if (c.minimum is not None) or (c.maximum is not None): @@ -165,34 +158,33 @@ def check(self, translate: ColumnTranslator) -> typing.Optional[str]: }.get(self.csvw_type) if c.minimum is not None: if func: - constraints.append("{2}(`{0}`) >= {2}('{1}')".format(cname, c.minimum, func)) + constraints.append(f"{func}(`{cname}`) >= {func}('{c.minimum}')") else: - constraints.append('`{0}` >= {1}'.format(cname, c.minimum)) + constraints.append(f'`{cname}` >= {c.minimum}') if c.maximum is not None: if func: - constraints.append("{2}(`{0}`) <= {2}('{1}')".format(cname, c.maximum, func)) + constraints.append(f"{func}(`{cname}`) <= {func}('{c.maximum}')") else: - constraints.append('`{0}` <= {1}'.format(cname, c.maximum)) + constraints.append(f'`{cname}` <= {c.maximum}') elif any(cc is not None for cc in [c.length, c.minLength, c.maxLength]): if c.length: - constraints.append('length(`{0}`) = {1}'.format(cname, c.length)) + constraints.append(f'length(`{cname}`) = {c.length}') if c.minLength: - constraints.append('length(`{0}`) >= {1}'.format(cname, c.minLength)) + constraints.append(f'length(`{cname}`) >= {c.minLength}') if c.maxLength: - constraints.append('length(`{0}`) <= {1}'.format(cname, c.maxLength)) + constraints.append(f'length(`{cname}`) <= {c.maxLength}') return ' AND '.join(constraints) def sql(self, translate: ColumnTranslator) -> str: + """Format the column metadata suitable for inclusion in a CREATE TABLE statement.""" _check = self.check(translate) - return '`{0}` {1}{2}{3}'.format( - translate(self.name), - self.db_type, - ' NOT NULL' if self.required else '', - ' CHECK ({0})'.format(_check) if _check else '') + null_constraint = ' NOT NULL' if self.required else '' + check_constraint = f' CHECK ({_check})' if _check else '' + return f'`{translate(self.name)}` {self.db_type.name}{null_constraint}{check_constraint}' -@attr.s -class TableSpec(object): 
+@dataclasses.dataclass +class TableSpec: """ A `TableSpec` captures sufficient information about a :class:`csvw.Table` for the DB schema. @@ -205,16 +197,17 @@ class TableSpec(object): .. seealso:: ``_ """ - name = attr.ib() - columns = attr.ib(default=attr.Factory(list)) - foreign_keys = attr.ib(default=attr.Factory(list)) - many_to_many = attr.ib(default=attr.Factory(collections.OrderedDict)) - primary_key = attr.ib(default=None) + name: str + columns: list[ColSpec] = dataclasses.field(default_factory=list) + foreign_keys: list = dataclasses.field(default_factory=list) + many_to_many: collections.OrderedDict = dataclasses.field( + default_factory=collections.OrderedDict) + primary_key: Optional[list[str]] = None @classmethod def from_table_metadata(cls, table: csvw.Table, - drop_self_referential_fks: typing.Optional[bool] = True) -> 'TableSpec': + drop_self_referential_fks: Optional[bool] = True) -> 'TableSpec': """ Create a `TableSpec` from the schema description of a `csvw.metadata.Table`. @@ -232,12 +225,11 @@ def from_table_metadata(cls, if len(fk.columnReference) == 1 and fk.columnReference[0] in list_valued: # List-valued foreign keys are turned into a many-to-many relation! 
assert len(fk.reference.columnReference) == 1, \ - 'Composite key {0} in table {1} referenced'.format( - fk.reference.columnReference, - fk.reference.resource) + (f'Composite key {fk.reference.columnReference} in table ' + f'{fk.reference.resource} referenced') assert spec.primary_key and len(spec.primary_key) == 1, \ - 'Table {0} referenced by list-valued foreign key must have non-composite ' \ - 'primary key'.format(spec.name) + (f'Table {spec.name} referenced by list-valued foreign key must have ' + f'non-composite primary key') spec.many_to_many[fk.columnReference[0]] = TableSpec.association_table( spec.name, spec.primary_key[0], @@ -271,13 +263,13 @@ def association_table(cls, atable, apk, btable, bpk) -> 'TableSpec': a column `context`, which stores the name of the foreign key column from which a row in the assocation table was created. """ - afk = ColSpec('{0}_{1}'.format(atable, apk)) - bfk = ColSpec('{0}_{1}'.format(btable, bpk)) + afk = ColSpec(f'{atable}_{apk}') + bfk = ColSpec(f'{btable}_{bpk}') if afk.name == bfk.name: afk.name += '_1' bfk.name += '_2' return cls( - name='{0}_{1}'.format(atable, btable), + name=f'{atable}_{btable}', columns=[afk, bfk, ColSpec('context')], foreign_keys=[ ([afk.name], atable, [apk]), @@ -291,21 +283,25 @@ def sql(self, translate: SchemaTranslator) -> str: :return: The SQL statement to create the table. 
""" col_translate = functools.partial(translate, self.name) + # Assemble the column specifications: clauses = [col.sql(col_translate) for col in self.columns] + # Then add the constraints: if self.primary_key: - clauses.append('PRIMARY KEY({0})'.format(quoted( - *[col_translate(c) for c in self.primary_key]))) + qcols = quoted(*[col_translate(c) for c in self.primary_key]) + clauses.append(f'PRIMARY KEY({qcols})') for fk, ref, refcols in self.foreign_keys: - clauses.append('FOREIGN KEY({0}) REFERENCES {1}({2}) ON DELETE CASCADE'.format( - quoted(*[col_translate(c) for c in fk]), - quoted(translate(ref)), - quoted(*[translate(ref, c) for c in refcols]))) - return "CREATE TABLE IF NOT EXISTS `{0}` (\n {1}\n)".format( - translate(self.name), ',\n '.join(clauses)) + fkcols = quoted(*[col_translate(c) for c in fk]) + rtable = quoted(translate(ref)) + pkcols = quoted(*[translate(ref, c) for c in refcols]) + clauses.append(f'FOREIGN KEY({fkcols}) REFERENCES {rtable}({pkcols}) ON DELETE CASCADE') + + clauses = ',\n '.join(clauses) + return '\n'.join([ + f"CREATE TABLE IF NOT EXISTS `{translate(self.name)}` (", f"{clauses}", ")"]) def schema(tg: csvw.TableGroup, - drop_self_referential_fks: typing.Optional[bool] = True) -> typing.List[TableSpec]: + drop_self_referential_fks: Optional[bool] = True) -> list[TableSpec]: """ Convert the table and column descriptions of a `TableGroup` into specifications for the DB schema. @@ -317,7 +313,7 @@ def schema(tg: csvw.TableGroup, :return: A pair (tables, reference_tables). """ tables = {} - for tname, table in tg.tabledict.items(): + for table in tg.tabledict.values(): t = TableSpec.from_table_metadata( table, drop_self_referential_fks=drop_self_referential_fks) tables[t.name] = t @@ -343,7 +339,7 @@ def schema(tg: csvw.TableGroup, return list(ordered.values()) -class Database(object): +class Database: """ Represents a SQLite database associated with a :class:`csvw.TableGroup` instance. 
@@ -365,26 +361,27 @@ class Database(object): def __init__( self, tg: TableGroup, - fname: typing.Optional[typing.Union[pathlib.Path, str]] = None, - translate: typing.Optional[SchemaTranslator] = None, - drop_self_referential_fks: typing.Optional[bool] = True, + fname: Optional[Union[pathlib.Path, str]] = None, + translate: Optional[SchemaTranslator] = None, + drop_self_referential_fks: Optional[bool] = True, ): self.translate = translate or Database.name_translator self.fname = pathlib.Path(fname) if fname else None self.init_schema(tg, drop_self_referential_fks=drop_self_referential_fks) self._connection = None # For in-memory dbs we need to keep the connection! - def init_schema(self, tg, drop_self_referential_fks=True): + def init_schema(self, tg: TableGroup, drop_self_referential_fks: bool = True): + """Initialize the db schema, possibly ignoring self-referential foreign keys.""" self.tg = tg self.tables = schema( self.tg, drop_self_referential_fks=drop_self_referential_fks) if self.tg else [] @property - def tdict(self) -> typing.Dict[str, TableSpec]: + def tdict(self) -> dict[str, TableSpec]: # pylint: disable=C0116 return {t.name: t for t in self.tables} @staticmethod - def name_translator(table: str, column: typing.Optional[str] = None) -> str: + def name_translator(table: str, column: Optional[str] = None) -> str: """ A callable with this signature can be passed into DB creation to control the names of the schema objects.
@@ -396,31 +393,37 @@ def name_translator(table: str, column: typing.Optional[str] = None) -> str: # By default, no translation is done: return column or table - def connection(self) -> typing.Union[sqlite3.Connection, contextlib.closing]: + def connection(self) -> Union[sqlite3.Connection, contextlib.closing]: + """DB connection to be used as context manager.""" if self.fname: return contextlib.closing(sqlite3.connect(str(self.fname))) if not self._connection: self._connection = sqlite3.connect(':memory:') return self._connection - def select_many_to_many(self, db, table, context) -> dict: + def _qt(self, tname: str, cname: Optional[str] = None) -> str: + """Translate and then quote a db schema object.""" + if cname: + return quoted(self.translate(tname, cname)) + return quoted(self.translate(tname)) + + def select_many_to_many(self, db, table, context) -> dict[str, Union[tuple[str, str], str]]: + """Select data from an association table, grouped by first foreign key.""" if context is not None: - context_sql = "WHERE context = '{0}'".format(context) + context_sql = f"WHERE context = '{context}'" else: context_sql = '' - sql = """\ -SELECT {0}, group_concat({1}, ' '), group_concat(COALESCE(context, ''), '||') -FROM {2} {3} GROUP BY {0}""".format( - quoted(self.translate(table.name, table.columns[0].name)), - quoted(self.translate(table.name, table.columns[1].name)), - quoted(self.translate(table.name)), - context_sql) + qt = functools.partial(self._qt, table.name) + sql = (f"SELECT {qt(table.columns[0].name)}, " + f" group_concat({qt(table.columns[1].name)}, ' '), " + f" group_concat(COALESCE(context, ''), '||') " + f"FROM {qt()} {context_sql} GROUP BY {qt(table.columns[0].name)}") cu = db.execute(sql) return { r[0]: [(k, v) if context is None else k for k, v in zip(r[1].split(), r[2].split('||'))] for r in cu.fetchall()} - def separator(self, tname: str, cname: str) -> typing.Optional[str]: + def separator(self, tname: str, cname: str) -> Optional[str]: """ 
:return: separator for the column specified by db schema names `tname` and `cname`. """ @@ -429,12 +432,14 @@ def separator(self, tname: str, cname: str) -> typing.Optional[str]: for col in self.tdict[name].columns: if self.translate(name, col.name) == cname: return col.separator + return None # pragma: no cover - def split_value(self, tname, cname, value) -> typing.Union[typing.List[str], str, None]: + def split_value(self, tname: str, cname: str, value) -> Union[list[str], str, None]: + """Split a value if a separator is defined for the column.""" sep = self.separator(tname, cname) return (value or '').split(sep) if sep else value - def read(self) -> typing.Dict[str, typing.List[typing.OrderedDict]]: + def read(self) -> dict[str, list[collections.OrderedDict]]: """ :return: A `dict` where keys are SQL table names corresponding to CSVW tables and values \ are lists of rows, represented as dicts where keys are the SQL column names. @@ -443,54 +448,21 @@ def read(self) -> typing.Dict[str, typing.List[typing.OrderedDict]]: with self.connection() as conn: for tname in self.tg.tabledict: # - # FIXME: how much do we want to use DB types? Probably as much as possible! - # Thus we need to convert on write **and** read! + # How much do we want to use DB types? Probably as much as possible! + # Thus we'd need to convert on write **and** read! # - convert, seps, refs = {}, {}, collections.defaultdict(dict) - table = self.tdict[tname] # The TableSpec object. 
- - # Assemble the conversion dictionary: - for col in table.columns: - convert[self.translate(tname, col.name)] = [col.name, identity] - if col.csvw_type in TYPE_MAP: - convert[self.translate(tname, col.name)][1] = TYPE_MAP[col.csvw_type][2] - else: - convert[self.translate(tname, col.name)][1] = \ - DATATYPES[col.csvw_type].to_python - if col.separator: - if col.csvw_type == 'string': - seps[self.translate(tname, col.name)] = col.separator - else: - seps[self.translate(tname, col.name)] = 'json' - + spec = TableReadSpec(self.tdict[tname], tname, self.translate) # Retrieve the many-to-many relations: - for col, at in table.many_to_many.items(): + for col, at in spec.table.many_to_many.items(): for pk, v in self.select_many_to_many(conn, at, col).items(): - refs[pk][self.translate(tname, col)] = v + spec.references[pk][self.translate(tname, col)] = v cols, rows = select(conn, self.translate(tname)) for row in rows: - d = collections.OrderedDict() - for k, v in zip(cols, row): - if k in seps: - if v is None: - d[k] = None - elif not v: - d[k] = [] - elif seps[k] == 'json': - d[k] = json.loads(v) - else: - d[k] = [convert[k][1](v_) for v_ in (v or '').split(seps[k])] - else: - d[k] = convert[k][1](v) if v is not None else None - pk = d[self.translate(tname, table.primary_key[0])] \ - if table.primary_key and len(table.primary_key) == 1 else None - d.update({k: [] for k in table.many_to_many}) - d.update(refs.get(pk, {})) - res[self.translate(tname)].append(d) + res[self.translate(tname)].append(spec.read_row(zip(cols, row))) return res - def association_table_context(self, table, column, fkey): + def association_table_context(self, _, column, fkey): """ Context for association tables is created calling this method. 
@@ -507,12 +479,50 @@ def association_table_context(self, table, column, fkey): return fkey, column def write_from_tg(self, _force=False, _exists_ok=False, _skip_extra=False): + """Write the data from the contained tablegroup to a db.""" return self.write( force=_force, _exists_ok=_exists_ok, _skip_extra=_skip_extra, **self.tg.read()) + def _get_rows(self, t, items, refs, _skip_extra): + rows, keys = [], [] + cols = {c.name: c for c in t.columns} + for i, row in enumerate(items): + pk = row[t.primary_key[0]] if t.primary_key and len(t.primary_key) == 1 else None + values = [] + for k, v in row.items(): + if k in t.many_to_many: + assert pk + atkey = tuple([t.many_to_many[k].name] + # noqa: W504 + [c.name for c in t.many_to_many[k].columns]) + # We distinguish None - meaning NULL - and [] - meaning no items - as + # values of list-valued columns. + refs[atkey].extend([ + tuple([pk] + list(self.association_table_context(t, k, vv))) + for vv in (v or [])]) + else: + if k not in cols: + if _skip_extra: + continue + raise ValueError(f'unspecified column {k} found in data') + col = cols[k] + if isinstance(v, list): + # Note: This assumes list-valued columns are of datatype string! + if col.csvw_type == 'string': + v = (col.separator or ';').join( + col.db_type.convert(vv) or '' for vv in v) + else: + v = json.dumps(v) + else: + v = col.db_type.convert(v) if v is not None else None + if i == 0: + keys.append(col.name) + values.append(v) + rows.append(tuple(values)) + return rows, keys + def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): """ Creates a db file with the core schema. 
@@ -522,8 +532,7 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): if self.fname and self.fname.exists(): if not force: raise ValueError('db file already exists, use force=True to overwrite') - else: - self.fname.unlink() + self.fname.unlink() with self.connection() as db: for table in self.tables: @@ -536,46 +545,56 @@ def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items): for t in self.tables: if t.name not in items: continue - rows, keys = [], [] - cols = {c.name: c for c in t.columns} - for i, row in enumerate(items[t.name]): - pk = row[t.primary_key[0]] \ - if t.primary_key and len(t.primary_key) == 1 else None - values = [] - for k, v in row.items(): - if k in t.many_to_many: - assert pk - at = t.many_to_many[k] - atkey = tuple([at.name] + [c.name for c in at.columns]) - # We distinguish None - meaning NULL - and [] - meaning no items - as - # values of list-valued columns. - for vv in (v or []): - fkey, context = self.association_table_context(t, k, vv) - refs[atkey].append((pk, fkey, context)) - else: - if k not in cols: - if _skip_extra: - continue - else: - raise ValueError( - 'unspecified column {0} found in data'.format(k)) - col = cols[k] - if isinstance(v, list): - # Note: This assumes list-valued columns are of datatype string! 
- if col.csvw_type == 'string': - v = (col.separator or ';').join( - col.convert(vv) or '' for vv in v) - else: - v = json.dumps(v) - else: - v = col.convert(v) if v is not None else None - if i == 0: - keys.append(col.name) - values.append(v) - rows.append(tuple(values)) + rows, keys = self._get_rows(t, items[t.name], refs, _skip_extra) insert(db, self.translate, t.name, keys, *rows) for atkey, rows in refs.items(): insert(db, self.translate, atkey[0], atkey[1:], *rows) db.commit() + + +@dataclasses.dataclass +class TableReadSpec: + """Bundles data informing the reading of table rows.""" + table: TableSpec + name: str + translate: SchemaTranslator + converters: dict[str, tuple[str, Callable]] = dataclasses.field(default_factory=dict) + separators: dict[str, str] = dataclasses.field(default_factory=dict) + references: dict = dataclasses.field(default_factory=lambda: collections.defaultdict(dict)) + + def __post_init__(self): + # Assemble the conversion dictionary: + for col in self.table.columns: + if col.csvw_type in TYPE_MAP: + conv = TYPE_MAP[col.csvw_type].convert + else: + conv = DATATYPES[col.csvw_type].to_python + self.converters[self.translate(self.name, col.name)] = (col.name, conv) + if col.separator: + if col.csvw_type == 'string': + self.separators[self.translate(self.name, col.name)] = col.separator + else: + self.separators[self.translate(self.name, col.name)] = 'json' + + def read_row(self, row: Iterator[tuple[str, Any]]) -> collections.OrderedDict[str, Any]: + """Read a table according to spec.""" + d = collections.OrderedDict() + for k, v in row: + if k in self.separators: + if v is None: + d[k] = None + elif not v: + d[k] = [] + elif self.separators[k] == 'json': + d[k] = json.loads(v) + else: + d[k] = [self.converters[k][1](v_) for v_ in (v or '').split(self.separators[k])] + else: + d[k] = self.converters[k][1](v) if v is not None else None + pk = d[self.translate(self.name, self.table.primary_key[0])] \ + if self.table.primary_key and 
len(self.table.primary_key) == 1 else None + d.update({k: [] for k in self.table.many_to_many}) + d.update(self.references.get(pk, {})) + return d diff --git a/src/csvw/dsv.py b/src/csvw/dsv.py index 0d050d2..858fc0d 100644 --- a/src/csvw/dsv.py +++ b/src/csvw/dsv.py @@ -15,12 +15,13 @@ import csv import codecs import shutil -import typing +from typing import Optional, Union, IO, Callable import pathlib import tempfile import warnings import functools import collections +from collections.abc import Iterable, Generator from . import utils from .dsv_dialects import Dialect @@ -32,14 +33,18 @@ 'rewrite', 'add_rows', 'filter_rows_as_dict', ] -LINES_OR_PATH = typing.Union[str, pathlib.Path, typing.IO, typing.Iterable[str]] +PathType = Union[str, pathlib.Path] +LinesOrPath = Union[PathType, IO, Iterable[str]] +# Note: The value for restkey is a list of all surplus column values. +DictRowType = collections.OrderedDict[str, Union[str, list[str]]] def normalize_encoding(encoding: str) -> str: + """Normalize the name of the encoding.""" return codecs.lookup(encoding).name -class UnicodeWriter: +class UnicodeWriter: # pylint: disable=too-many-instance-attributes """ Write Unicode data to a csv file. 
@@ -60,8 +65,8 @@ class UnicodeWriter: def __init__( self, - f: typing.Optional[typing.Union[str, pathlib.Path]] = None, - dialect: typing.Optional[typing.Union[Dialect, str]] = None, + f: Optional[PathType] = None, + dialect: Optional[Union[Dialect, str]] = None, **kw): self.f = f self.encoding = kw.pop('encoding', 'utf-8') @@ -89,6 +94,7 @@ def _escapedoubled(row): self._escapedoubled = _escapedoubled self._close = False self._rows_written = 0 + self.writer = None def __enter__(self): if isinstance(self.f, (str, pathlib.Path)): @@ -103,7 +109,7 @@ def __enter__(self): self.writer = csv.writer(self.f, **self.kw) return self - def read(self) -> typing.Optional[bytes]: + def read(self) -> Optional[bytes]: """ If the writer has been initialized passing `None` as target, the CSV data as `bytes` can be retrieved calling this method. @@ -112,16 +118,18 @@ def read(self) -> typing.Optional[bytes]: self.f.seek(0) if hasattr(self.f, 'read'): return self.f.read().encode('utf-8') + return None # pragma: no cover - def __exit__(self, type, value, traceback): + def __exit__(self, type_, value, traceback): if self._close: self.f.close() - def writerow(self, row: typing.Iterable[typing.Union[str, None]]): + def writerow(self, row: Iterable[Union[str, None]]): + """Write multiple rows.""" self.writer.writerow(self._escapedoubled(row)) self._rows_written += 1 - def writerows(self, rows: typing.Iterable[typing.Union[tuple, list, dict]]): + def writerows(self, rows: Iterable[Union[tuple, list, dict]]): """ Writes each row in `rows` formatted as CSV row. This behaves as [`csvwriter.writerows`](https://docs.python.org/3/library/csv.html#csv.csvwriter.writerows) @@ -141,7 +149,7 @@ def writerows(self, rows: typing.Iterable[typing.Union[tuple, list, dict]]): self.writerow(row) -class UnicodeReader: +class UnicodeReader: # pylint: disable=too-many-instance-attributes """ Read Unicode data from a csv file. 
@@ -164,13 +172,15 @@ class UnicodeReader: """ def __init__( self, - f: LINES_OR_PATH, - dialect: typing.Optional[typing.Union[Dialect, str]] = None, + f: LinesOrPath, + dialect: Optional[Union[Dialect, str]] = None, **kw): self.f = f self.encoding = normalize_encoding(kw.pop('encoding', 'utf-8-sig')) self.newline = kw.pop('lineterminator', None) self.dialect = dialect if isinstance(dialect, Dialect) else None + self.lineno = None + self.reader = None if self.dialect: self.encoding = self.dialect.python_encoding self.kw = dialect.as_python_formatting_parameters() @@ -215,7 +225,7 @@ def _next_row(self): row = [ s if isinstance(s, str) else s.decode(self._reader_encoding) for s in next(self.reader)] - self.lineno += sum([list(s).count('\n') for s in row]) + self.lineno += sum(list(s).count('\n') for s in row) return row def __next__(self): @@ -244,7 +254,7 @@ def __iter__(self): return self -class UnicodeReaderWithLineNumber(UnicodeReader): +class UnicodeReaderWithLineNumber(UnicodeReader): # pylint: disable=too-few-public-methods """ A `UnicodeReader` yielding (lineno, row) pairs, where "lineno" is the 1-based number of the the **text line** where the (possibly multi-line) row data starts in the DSV file. 
@@ -254,7 +264,7 @@ def __next__(self): :return: a pair (1-based line number in the input, row) """ # Retrieve the row, thereby incrementing the line number: - row = super(UnicodeReaderWithLineNumber, self).__next__() + row = super().__next__() return self.lineno + 1, row @@ -281,19 +291,26 @@ class UnicodeDictReader(UnicodeReader): """ - def __init__(self, f, fieldnames=None, restkey=None, restval=None, **kw): + def __init__( + self, + f, + fieldnames: Optional[list[str]] = None, + restkey: Optional[str] = None, + restval: Optional[str] = None, + **kw): self._fieldnames = fieldnames # list of keys for the dict self.restkey = restkey # key to catch long rows self.restval = restval # default value for short rows self.line_num = 0 - super(UnicodeDictReader, self).__init__(f, **kw) + super().__init__(f, **kw) @property - def fieldnames(self): + def fieldnames(self) -> Optional[list[str]]: + """Get the fieldnames, i.e. the dictionary keys for the rows.""" if self._fieldnames is None: - try: - self._fieldnames = super(UnicodeDictReader, self).__next__() - except StopIteration: + try: # Read the first row. + self._fieldnames = super().__next__() + except StopIteration: # No rows, so no fieldnames is ok. pass self.line_num = self.reader.line_num if self._fieldnames: @@ -301,21 +318,22 @@ def fieldnames(self): warnings.warn('Duplicate column names!') return self._fieldnames - def __next__(self) -> collections.OrderedDict: + def __next__(self) -> DictRowType: if self.line_num == 0: # Used only for its side effect. 
- self.fieldnames - row = super(UnicodeDictReader, self).__next__() + self.fieldnames # pylint: disable=pointless-statement + row = super().__next__() self.line_num = self.reader.line_num # unlike the basic reader, we prefer not to return blanks, # because we will typically wind up with a dict full of None # values while row == []: - row = super(UnicodeDictReader, self).__next__() + row = super().__next__() return self.item(row) - def item(self, row) -> collections.OrderedDict: + def item(self, row) -> DictRowType: + """Turn a row into a dict.""" d = collections.OrderedDict((k, v) for k, v in zip(self.fieldnames, row)) lf = len(self.fieldnames) lr = len(row) @@ -341,10 +359,14 @@ class NamedTupleReader(UnicodeDictReader): @functools.cached_property def cls(self): + """ + Creates a namedtuple class suitable for the columns of the CSV content. + """ fieldnames = list(map(self._normalize_fieldname, self.fieldnames)) return collections.namedtuple('Row', fieldnames) def item(self, row): + """Create a namedtuple from a row.""" d = UnicodeDictReader.item(self, row) for name in self.fieldnames: d.setdefault(name, None) @@ -352,11 +374,11 @@ def item(self, row): **{self._normalize_fieldname(k): v for k, v in d.items() if k in self.fieldnames}) -def iterrows(lines_or_file: LINES_OR_PATH, - namedtuples: typing.Optional[bool] = False, - dicts: typing.Optional[bool] = False, - encoding: typing.Optional[str] = 'utf-8', - **kw) -> typing.Generator: +def iterrows(lines_or_file: LinesOrPath, + namedtuples: Optional[bool] = False, + dicts: Optional[bool] = False, + encoding: Optional[str] = 'utf-8', + **kw) -> Generator: """Convenience factory function for csv reader. :param lines_or_file: Content to be read. 
Either a file handle, a file path or a list\ @@ -369,7 +391,7 @@ def iterrows(lines_or_file: LINES_OR_PATH, """ if namedtuples and dicts: raise ValueError('either namedtuples or dicts can be chosen as output format') - elif namedtuples: + if namedtuples: _reader = NamedTupleReader elif dicts: _reader = UnicodeDictReader @@ -377,16 +399,13 @@ def iterrows(lines_or_file: LINES_OR_PATH, _reader = UnicodeReader with _reader(lines_or_file, encoding=encoding, **kw) as r: - for item in r: - yield item + yield from r reader = iterrows -def rewrite(fname: typing.Union[str, pathlib.Path], - visitor: typing.Callable[[int, typing.List[str]], typing.Union[None, typing.List[str]]], - **kw): +def rewrite(fname: PathType, visitor: Callable[[int, list[str]], Union[None, list[str]]], **kw): """Utility function to rewrite rows in dsv files. :param fname: Path of the dsv file to operate on. @@ -394,7 +413,7 @@ def rewrite(fname: typing.Union[str, pathlib.Path], (modified) row or None to filter out the row. :param kw: Keyword parameters are passed through to csv.reader/csv.writer. """ - fname = utils.ensure_path(fname) + fname = pathlib.Path(fname) assert fname.is_file() with tempfile.NamedTemporaryFile(delete=False) as fp: tmp = pathlib.Path(fp.name) @@ -405,26 +424,27 @@ def rewrite(fname: typing.Union[str, pathlib.Path], row = visitor(i, row) if row is not None: writer.writerow(row) - shutil.move(str(tmp), str(fname)) # Path.replace is Python 3.3+ + shutil.move(tmp, fname) -def add_rows(fname: typing.Union[str, pathlib.Path], *rows: typing.List[str]): +def add_rows(fname: PathType, *rows: list[str]): + """ + Add rows to a CSV file. 
+ """ with tempfile.NamedTemporaryFile(delete=False) as fp: tmp = pathlib.Path(fp.name) - fname = utils.ensure_path(fname) + fname = pathlib.Path(fname) with UnicodeWriter(tmp) as writer: if fname.exists(): with UnicodeReader(fname) as reader_: for row in reader_: writer.writerow(row) writer.writerows(rows) - shutil.move(str(tmp), str(fname)) # Path.replace is Python 3.3+ + shutil.move(tmp, fname) -def filter_rows_as_dict(fname: typing.Union[str, pathlib.Path], - filter_: typing.Callable[[dict], bool], - **kw) -> int: +def filter_rows_as_dict(fname: PathType, filter_: Callable[[dict], bool], **kw) -> int: """Rewrite a dsv file, filtering the rows. :param fname: Path to dsv file @@ -439,14 +459,14 @@ def filter_rows_as_dict(fname: typing.Union[str, pathlib.Path], return filter_.removed -class DictFilter(object): - - def __init__(self, filter_): - self.header = None +class DictFilter: # pylint: disable=R0903 + """Utility to apply a filter to a row as dict, while iterating over rows a list.""" + def __init__(self, filter_: Callable[[dict[str, str]], bool]): + self.header: Optional[list[str]] = None self.filter = filter_ - self.removed = 0 + self.removed: int = 0 - def __call__(self, i, row): + def __call__(self, i: int, row: list[str]) -> Optional[list[str]]: if i == 0: self.header = row return row @@ -454,5 +474,5 @@ def __call__(self, i, row): item = dict(zip(self.header, row)) if self.filter(item): return row - else: - self.removed += 1 + self.removed += 1 + return None diff --git a/src/csvw/dsv_dialects.py b/src/csvw/dsv_dialects.py index e35f391..76a17a2 100644 --- a/src/csvw/dsv_dialects.py +++ b/src/csvw/dsv_dialects.py @@ -9,11 +9,13 @@ - ``_ - ``_ """ -import attr +from typing import Callable, Literal import warnings import functools +import dataclasses from . 
import utils +from .metadata_utils import dataclass_asdict __all__ = ['Dialect'] @@ -22,131 +24,102 @@ } -# FIXME: replace with attrs.validators.ge(0) from attrs 21.3.0 -def _non_negative(instance, attribute, value): - if value < 0: # pragma: no cover - raise ValueError('{0} is not a valid {1}'.format(value, attribute.name)) - - -non_negative_int = [attr.validators.instance_of(int), _non_negative] - - def convert_encoding(s): - s = utils.converter(str, 'utf-8', s) + """We want to force utf-8 encoding, but accept diverse ways of specifying this :).""" + s = utils.type_checker(str, 'utf-8', s) try: _ = 'x'.encode(ENCODING_MAP.get(s, s)) return s except LookupError: - warnings.warn('Invalid value for property: {}'.format(s)) + warnings.warn(f'Invalid value for property: {s}') return 'utf-8' -@attr.s -class Dialect(object): +@dataclasses.dataclass +class Dialect: # pylint: disable=too-many-instance-attributes """ A CSV dialect specification. .. seealso:: ``_ """ - encoding = attr.ib( - default='utf-8', - converter=convert_encoding, - validator=attr.validators.instance_of(str)) - - lineTerminators = attr.ib( - converter=functools.partial(utils.converter, list, ['\r\n', '\n']), - default=attr.Factory(lambda: ['\r\n', '\n'])) - - quoteChar = attr.ib( - converter=functools.partial(utils.converter, str, '"', allow_none=True), - default='"', - ) - - doubleQuote = attr.ib( - default=True, - converter=functools.partial(utils.converter, bool, True), - validator=attr.validators.instance_of(bool)) - - skipRows = attr.ib( - default=0, - converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0), - validator=non_negative_int) - - commentPrefix = attr.ib( - default='#', - converter=functools.partial(utils.converter, str, '#', allow_none=True), - validator=attr.validators.optional(attr.validators.instance_of(str))) - - header = attr.ib( - default=True, - converter=functools.partial(utils.converter, bool, True), - validator=attr.validators.instance_of(bool)) - - 
headerRowCount = attr.ib( - default=1, - converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0), - validator=non_negative_int) - - delimiter = attr.ib( - default=',', - converter=functools.partial(utils.converter, str, ','), - validator=attr.validators.instance_of(str)) - - skipColumns = attr.ib( - default=0, - converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0), - validator=non_negative_int) - - skipBlankRows = attr.ib( - default=False, - converter=functools.partial(utils.converter, bool, False), - validator=attr.validators.instance_of(bool)) - - skipInitialSpace = attr.ib( - default=False, - converter=functools.partial(utils.converter, bool, False), - validator=attr.validators.instance_of(bool)) - - trim = attr.ib( - default='false', - validator=attr.validators.in_(['true', 'false', 'start', 'end']), - converter=lambda v: functools.partial( - utils.converter, - (str, bool), 'false')('{0}'.format(v).lower() if isinstance(v, bool) else v)) - - def updated(self, **kw): - res = self.__class__(**attr.asdict(self)) + encoding: str = 'utf-8' + lineTerminators: list[str] = dataclasses.field( # pylint: disable=invalid-name + default_factory=lambda: ['\r\n', '\n']) + quoteChar: str = '"' # pylint: disable=invalid-name + doubleQuote: bool = True # pylint: disable=invalid-name + skipRows: int = 0 # pylint: disable=invalid-name + commentPrefix: str = '#' # pylint: disable=invalid-name + header: bool = True + headerRowCount: int = 1 # pylint: disable=invalid-name + delimiter: str = ',' + skipColumns: int = 0 # pylint: disable=invalid-name + skipBlankRows: bool = False # pylint: disable=invalid-name + skipInitialSpace: bool = False # pylint: disable=invalid-name + trim: Literal['true', 'false', 'start', 'end'] = 'false' + + def __post_init__(self): + self.encoding = convert_encoding(self.encoding) + self.line_terminators = utils.type_checker(list, ['\r\n', '\n'], self.line_terminators) + self.quoteChar = utils.type_checker(str, '"', 
self.quoteChar, allow_none=True) + self.doubleQuote = utils.type_checker(bool, True, self.doubleQuote) + self.skipRows = utils.type_checker(int, 0, self.skipRows, cond=lambda s: s >= 0) + self.commentPrefix = utils.type_checker(str, '#', self.commentPrefix, allow_none=True) + self.header = utils.type_checker(bool, True, self.header) + self.headerRowCount = utils.type_checker( + int, 1, self.headerRowCount, cond=lambda s: s >= 0) + self.delimiter = utils.type_checker(str, ',', self.delimiter) + self.skipColumns = utils.type_checker(int, 0, self.skipColumns, cond=lambda s: s >= 0) + self.skipBlankRows = utils.type_checker(bool, False, self.skipBlankRows) + self.skipInitialSpace = utils.type_checker(bool, False, self.skipInitialSpace) + self.trim = utils.type_checker( + (str, bool), 'false', str(self.trim).lower() + if isinstance(self.trim, bool) else self.trim) + assert self.trim in ['true', 'false', 'start', 'end'], 'invalid trim' + + def updated(self, **kw) -> 'Dialect': + """Update the spec, returning a new updated object.""" + res = self.__class__(**dataclasses.asdict(self)) for k, v in kw.items(): setattr(res, k, v) return res @functools.cached_property - def escape_character(self): + def escape_character(self): # pylint: disable=C0116 return None if self.quoteChar is None else ('"' if self.doubleQuote else '\\') @functools.cached_property - def line_terminators(self): + def line_terminators(self) -> list[str]: # pylint: disable=C0116 return [self.lineTerminators] \ if isinstance(self.lineTerminators, str) else self.lineTerminators @functools.cached_property - def trimmer(self): + def trimmer(self) -> Callable[[str], str]: + """Map trim spec to a callable to do the trimming.""" return { + True: lambda s: s.strip(), 'true': lambda s: s.strip(), + False: lambda s: s, 'false': lambda s: s, 'start': lambda s: s.lstrip(), 'end': lambda s: s.rstrip() }[self.trim] def asdict(self, omit_defaults=True): - return utils.attr_asdict(self, omit_defaults=omit_defaults) + 
"""The dialect spec as dict suitable for JSON serialization.""" + return dataclass_asdict(self, omit_defaults=omit_defaults) @property def python_encoding(self): + """ + Turn the encoding name into something understood by python. + """ return ENCODING_MAP.get(self.encoding, self.encoding) def as_python_formatting_parameters(self): + """ + Turn the dialect spec into a dict suitable as kwargs for Python's csv implementation. + """ return { 'delimiter': self.delimiter, 'doublequote': self.doubleQuote, diff --git a/src/csvw/frictionless.py b/src/csvw/frictionless.py index 3bd0200..23d4291 100644 --- a/src/csvw/frictionless.py +++ b/src/csvw/frictionless.py @@ -10,58 +10,66 @@ """ import json import pathlib +from typing import Any, TYPE_CHECKING + +if TYPE_CHECKING: + from csvw.metadata import TableGroup # pragma: no cover + + +def _convert_numeric_datatype(spec): + datatype = {'base': spec['type']} + if spec['type'] == 'string' and spec.get('format'): + datatype['dc:format'] = spec['format'] + if spec['type'] == 'boolean' and spec.get('trueValues') and spec.get('falseValues'): + datatype['format'] = f"{spec['trueValues'][0]}|{spec['falseValues'][0]}" + if spec['type'] in ['number', 'integer']: + if spec.get('bareNumber') is True: # pragma: no cover + raise NotImplementedError( + 'bareNumber is not supported in CSVW. It may be possible to translate to ' + 'a number pattern, though. 
See ' + 'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/' + '#formats-for-numeric-types') + if any(prop in spec for prop in ['decimalChar', 'groupChar']): + datatype['format'] = {} + for p in ['decimalChar', 'groupChar']: + if spec.get(p): + datatype['format'][p] = spec[p] + return datatype + + +def _convert_datatype(spec): # pylint: disable=too-many-return-statements + typemap = { + 'year': 'gYear', + 'yearmonth': 'gYearMonth', + } + if 'type' in spec: + if spec['type'] == 'string' and spec.get('format') == 'binary': + return {'base': 'binary'} + if spec['type'] == 'string' and spec.get('format') == 'uri': + return {'base': 'anyURI'} + if spec['type'] in typemap: + return {'base': typemap[spec['type']]} + if spec['type'] in [ + 'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration', + ]: + return _convert_numeric_datatype(spec) + if spec['type'] in ['object', 'array']: + return {'base': 'json', 'dc:format': 'application/json'} + if spec['type'] == 'geojson': + return {'base': 'json', 'dc:format': 'application/geo+json'} + return {'base': 'string'} -def convert_column_spec(spec): +def convert_column_spec(spec: dict[str, Any]) -> dict[str, Any]: """ https://specs.frictionlessdata.io/table-schema/#field-descriptors :param spec: :return: """ - typemap = { - 'year': 'gYear', - 'yearmonth': 'gYearMonth', - } + res = {'name': spec['name'], 'datatype': _convert_datatype(spec)} titles = [t for t in [spec.get('title')] if t] - - res = {'name': spec['name'], 'datatype': {'base': 'string'}} - if 'type' in spec: - if spec['type'] == 'string' and spec.get('format') == 'binary': - res['datatype']['base'] = 'binary' - elif spec['type'] == 'string' and spec.get('format') == 'uri': - res['datatype']['base'] = 'anyURI' - elif spec['type'] in typemap: - res['datatype']['base'] = typemap[spec['type']] - elif spec['type'] in [ - 'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration', - ]: - res['datatype']['base'] = 
spec['type'] - if spec['type'] == 'string' and spec.get('format'): - res['datatype']['dc:format'] = spec['format'] - if spec['type'] == 'boolean' and spec.get('trueValues') and spec.get('falseValues'): - res['datatype']['format'] = '{}|{}'.format( - spec['trueValues'][0], spec['falseValues'][0]) - if spec['type'] in ['number', 'integer']: - if spec.get('bareNumber') is True: # pragma: no cover - raise NotImplementedError( - 'bareNumber is not supported in CSVW. It may be possible to translate to ' - 'a number pattern, though. See ' - 'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/' - '#formats-for-numeric-types') - if any(prop in spec for prop in ['decimalChar', 'groupChar']): - res['datatype']['format'] = {} - for p in ['decimalChar', 'groupChar']: - if spec.get(p): - res['datatype']['format'][p] = spec[p] - elif spec['type'] in ['object', 'array']: - res['datatype']['base'] = 'json' - res['datatype']['dc:format'] = 'application/json' - elif spec['type'] == 'geojson': - res['datatype']['base'] = 'json' - res['datatype']['dc:format'] = 'application/geo+json' - if titles: res['titles'] = titles if 'description' in spec: @@ -75,24 +83,25 @@ def convert_column_spec(spec): res['datatype'][prop] = constraints[prop] if ('pattern' in constraints) and ('format' not in res['datatype']): res['datatype']['format'] = constraints['pattern'] - # FIXME: we could transform the "enum" constraint for string into + # We could transform the "enum" constraint for string into # a regular expression in the "format" property. return res -def convert_foreignKey(rsc_name, fk, resource_map): +def convert_foreignKey( # pylint: disable=C0103 + rsc_name: str, fk: dict, resource_map: dict) -> dict[str, Any]: """ https://specs.frictionlessdata.io/table-schema/#foreign-keys """ # Rename "fields" to "columnReference" and map resource name to url (resolving self-referential # foreign keys). 
- return dict( - columnReference=fk['fields'], - reference=dict( - columnReference=fk['reference']['fields'], - resource=resource_map[fk['reference']['resource'] or rsc_name], - ) - ) + return { + 'columnReference': fk['fields'], + 'reference': { + 'columnReference': fk['reference']['fields'], + 'resource': resource_map[fk['reference']['resource'] or rsc_name], + } + } def convert_table_schema(rsc_name, schema, resource_map): @@ -104,9 +113,7 @@ def convert_table_schema(rsc_name, schema, resource_map): key constraints. :return: `dict` suitable for instantiating a `csvw.metadata.Schema` object. """ - res = dict( - columns=[convert_column_spec(f) for f in schema['fields']], - ) + res = {'columns': [convert_column_spec(f) for f in schema['fields']]} for prop in [ ('missingValues', 'null'), 'primaryKey', @@ -152,7 +159,10 @@ def convert_dialect(rsc): return res -class DataPackage: +class DataPackage: # pylint: disable=R0903 + """ + Metadata according to the frictionless spec. + """ def __init__(self, spec, directory=None): if isinstance(spec, DataPackage): self.json = spec.json @@ -170,10 +180,8 @@ def __init__(self, spec, directory=None): self.json = spec - def to_tablegroup(self, cls=None): - from csvw import TableGroup - - md = {'@context': "http://www.w3.org/ns/csvw"} + def to_tablegroup(self, cls: type) -> 'TableGroup': # pylint: disable=C0116 + md: dict[str, Any] = {'@context': "http://www.w3.org/ns/csvw"} # Package metadata: md['dc:replaces'] = json.dumps(self.json) @@ -211,14 +219,13 @@ def to_tablegroup(self, cls=None): rsc.get('format') == 'csv': # Table Schema: md.setdefault('tables', []) - table = dict( - url=rsc['path'], - tableSchema=convert_table_schema(rsc.get('name'), schema, resource_map), - dialect=convert_dialect(rsc), - ) + table = { + 'url': rsc['path'], + 'tableSchema': convert_table_schema(rsc.get('name'), schema, resource_map), + 'dialect': convert_dialect(rsc), + } md['tables'].append(table) - cls = cls or TableGroup res = cls.fromvalue(md) - 
res._fname = self.dir / 'csvw-metadata.json' + res._fname = self.dir / 'csvw-metadata.json' # pylint: disable=W0212 return res diff --git a/src/csvw/jsonld.py b/src/csvw/jsonld.py index 92daa46..9238df8 100644 --- a/src/csvw/jsonld.py +++ b/src/csvw/jsonld.py @@ -1,23 +1,30 @@ +""" +Functionality to transform CSVW row values to RDF. +""" import re import json import math -import typing +from typing import TYPE_CHECKING, Any, Union import decimal import pathlib import datetime import collections +from collections.abc import Iterable +import dataclasses -import attr from rdflib import Graph, URIRef, Literal from rfc3986 import URIReference from isodate.duration import Duration from .utils import is_url +if TYPE_CHECKING: + from .metadata import Table, Column # pragma: no cover + __all__ = ['group_triples', 'to_json', 'Triple', 'format_value'] -def format_value(value, col): +def format_value(value: Any, col: 'Column') -> str: # pylint: disable=R0911 """ Format values as JSON-LD literals. """ @@ -29,66 +36,76 @@ def format_value(value, col): res = re.sub('T[0-9.:]+', '', res) if isinstance(value, (datetime.datetime, datetime.time)): stamp, _, milliseconds = res.partition('.') - return '{}.{}'.format(stamp, milliseconds.rstrip('0')) if milliseconds \ + return f'{stamp}.{milliseconds.rstrip("0")}' if milliseconds \ else stamp.replace('+00:00', 'Z') return res # pragma: no cover if isinstance(value, datetime.timedelta): return col.datatype.formatted(value) if isinstance(value, Duration): return col.datatype.formatted(value) - if isinstance(value, decimal.Decimal): - value = float(value) if isinstance(value, URIReference): return value.unsplit() if isinstance(value, bytes): return col.datatype.formatted(value) if isinstance(value, pathlib.Path): return str(value) + if isinstance(value, decimal.Decimal): + value = float(value) if isinstance(value, float): return 'NaN' if math.isnan(value) else ( - '{}INF'.format('-' if value < 0 else '') if math.isinf(value) else value) + 
f"{'-' if value < 0 else ''}INF" if math.isinf(value) else value) return value -@attr.s +@dataclasses.dataclass class Triple: """ A table cell's data as RDF triple. """ - about = attr.ib() - property = attr.ib() - value = attr.ib() + about: str + property: str + value: str - def as_rdflib_triple(self): + def as_rdflib_triple(self) -> tuple[URIRef, URIRef, Union[URIRef, Literal]]: + """The triple suitable for inclusion in an rdflib.Graph.""" return ( URIRef(self.about), URIRef(self.property), URIRef(self.value) if is_url(self.value) else Literal(self.value)) @classmethod - def from_col(cls, table, col, row, prop, val, rownum): + def from_col( # pylint: disable=R0913,R0917 + cls, + table: 'Table', + col: 'Column', + row: collections.OrderedDict[str, Any], + prop: str, + val: Any, + rownum: int, + ) -> 'Triple': """ - + Instantiate a triple from the data (and metadata) of a column value. """ _name = col.header if col else None - propertyUrl = col.propertyUrl if col else table.inherit('propertyUrl') + propertyUrl = col.propertyUrl if col \ + else table.inherit('propertyUrl') # pylint: disable=C0103 if propertyUrl: prop = table.expand(propertyUrl, row, _row=rownum, _name=_name, qname=True) is_type = prop == 'rdf:type' - valueUrl = col.valueUrl if col else table.inherit('valueUrl') + valueUrl = col.valueUrl if col else table.inherit('valueUrl') # pylint: disable=C0103 if valueUrl: val = table.expand(valueUrl, row, _row=rownum, _name=_name, qname=is_type) val = format_value(val, col) s = None - aboutUrl = col.aboutUrl if col else None + aboutUrl = col.aboutUrl if col else None # pylint: disable=invalid-name if aboutUrl: s = table.expand(aboutUrl, row, _row=rownum, _name=_name) or s return cls(about=s, property=prop, value=val) -def frame(data: list) -> list: +def frame(data: list[dict]) -> list: """ Inline referenced items to force a deterministic graph layout. 
@@ -131,13 +148,11 @@ def to_json(obj, flatten_list=False): return obj -def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]: - """ - Group and frame triples into a `list` of JSON objects. - """ +def _merged_triples(triples: Iterable[Triple]) -> list[Triple]: merged = [] for triple in triples: if isinstance(triple.value, list): + # We check, whether a list-valued triple for the same property is already present. for t in merged: if t.property == triple.property and isinstance(t.value, list): t.value.extend(triple.value) @@ -146,25 +161,35 @@ def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]: merged.append(triple) else: merged.append(triple) + return merged - grouped = collections.OrderedDict() - triples = [] - # First pass: get top-level properties. - for triple in merged: + +def _extract_grouped_triples(triples) -> tuple[collections.OrderedDict[str, Triple], list[Triple]]: + """Return triples grouped by property and purge these from `triples`.""" + grouped, rem = collections.OrderedDict(), [] + for triple in triples: if triple.about is None and triple.property == '@id': grouped[triple.property] = triple.value - else: - if not triple.about: - # For test48 - if triple.property in grouped: - if not isinstance(grouped[triple.property], list): - grouped[triple.property] = [grouped[triple.property]] - grouped[triple.property].append(triple.value) - else: - grouped[triple.property] = triple.value + continue + if not triple.about: + # For test48 + if triple.property in grouped: + if not isinstance(grouped[triple.property], list): + grouped[triple.property] = [grouped[triple.property]] + grouped[triple.property].append(triple.value) else: - triples.append(triple) - if not triples: + grouped[triple.property] = triple.value + continue + rem.append(triple) + return grouped, rem + + +def group_triples(triples: Iterable[Triple]) -> list[dict]: + """ + Group and frame triples into a `list` of JSON objects. 
+ """ + grouped, triples = _extract_grouped_triples(_merged_triples(triples)) + if not triples: # All grouped. return [grouped] g = Graph() @@ -174,6 +199,7 @@ def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]: for prop, val in grouped.items(): if prop != '@id': g.add(Triple(about=grouped['@id'], property=prop, value=val).as_rdflib_triple()) + res = g.serialize(format='json-ld') # Frame and simplify the resulting objects, augment with list index: res = [(i, to_json(v, flatten_list=True)) for i, v in enumerate(frame(json.loads(res)))] diff --git a/src/csvw/metadata.py b/src/csvw/metadata.py index 4d485e2..dc64544 100644 --- a/src/csvw/metadata.py +++ b/src/csvw/metadata.py @@ -1,5 +1,4 @@ -# metadata.py - +# pylint: disable=too-many-lines """Functionality to read and write metadata for CSV files. This module implements (partially) the W3C recommendation @@ -8,24 +7,26 @@ .. seealso:: https://www.w3.org/TR/tabular-metadata/ """ import io +import logging import re import json import shutil import decimal import pathlib -import typing +from typing import Optional, Union, Any, Literal, TypeVar import zipfile +import datetime import operator import warnings import functools import itertools import contextlib import collections +from collections.abc import Iterable, Generator +import dataclasses from urllib.parse import urljoin, urlparse, urlunparse from language_tags import tags -import attr -import requests import uritemplate from . import utils @@ -33,157 +34,33 @@ from .dsv import Dialect as BaseDialect, UnicodeReaderWithLineNumber, UnicodeWriter from .frictionless import DataPackage from . 
import jsonld +from .metadata_utils import DescriptionBase, dataclass_asdict, NAMESPACES, dialect_props, \ + valid_context_property DEFAULT = object() __all__ = [ - 'TableGroup', - 'Table', 'Column', 'ForeignKey', - 'Link', 'NaturalLanguage', - 'Datatype', - 'is_url', - 'CSVW', + 'TableGroup', 'Table', 'Column', 'ForeignKey', 'Link', 'NaturalLanguage', 'Datatype', + 'is_url', 'CSVW', ] -NAMESPACES = { - 'csvw': 'http://www.w3.org/ns/csvw#', - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', - 'xsd': 'http://www.w3.org/2001/XMLSchema#', - 'dc': 'http://purl.org/dc/terms/', - 'dcat': 'http://www.w3.org/ns/dcat#', - 'prov': 'http://www.w3.org/ns/prov#', - 'schema': 'http://schema.org/', - "as": "https://www.w3.org/ns/activitystreams#", - "cc": "http://creativecommons.org/ns#", - "ctag": "http://commontag.org/ns#", - "dc11": "http://purl.org/dc/elements/1.1/", - "dctypes": "http://purl.org/dc/dcmitype/", - "dqv": "http://www.w3.org/ns/dqv#", - "duv": "https://www.w3.org/ns/duv#", - "foaf": "http://xmlns.com/foaf/0.1/", - "gr": "http://purl.org/goodrelations/v1#", - "grddl": "http://www.w3.org/2003/g/data-view#", - "ical": "http://www.w3.org/2002/12/cal/icaltzd#", - "jsonld": "http://www.w3.org/ns/json-ld#", - "ldp": "http://www.w3.org/ns/ldp#", - "ma": "http://www.w3.org/ns/ma-ont#", - "oa": "http://www.w3.org/ns/oa#", - "odrl": "http://www.w3.org/ns/odrl/2/", - "og": "http://ogp.me/ns#", - "org": "http://www.w3.org/ns/org#", - "owl": "http://www.w3.org/2002/07/owl#", - "qb": "http://purl.org/linked-data/cube#", - "rdfa": "http://www.w3.org/ns/rdfa#", - "rev": "http://purl.org/stuff/rev#", - "rif": "http://www.w3.org/2007/rif#", - "rr": "http://www.w3.org/ns/r2rml#", - "sd": "http://www.w3.org/ns/sparql-service-description#", - "sioc": "http://rdfs.org/sioc/ns#", - "skos": "http://www.w3.org/2004/02/skos/core#", - "skosxl": "http://www.w3.org/2008/05/skos-xl#", - "sosa": "http://www.w3.org/ns/sosa/", - "ssn": 
"http://www.w3.org/ns/ssn/", - "time": "http://www.w3.org/2006/time#", - "v": "http://rdf.data-vocabulary.org/#", - "vcard": "http://www.w3.org/2006/vcard/ns#", - "void": "http://rdfs.org/ns/void#", - "wdr": "http://www.w3.org/2007/05/powder#", - "wrds": "http://www.w3.org/2007/05/powder-s#", - "xhv": "http://www.w3.org/1999/xhtml/vocab#", - "xml": "http://www.w3.org/XML/1998/namespace", -} -CSVW_TERMS = """Cell -Column -Datatype -Dialect -Direction -ForeignKey -JSON -NumericFormat -Row -Schema -Table -TableGroup -TableReference -Transformation -aboutUrl -base -columnReference -columns -commentPrefix -datatype -decimalChar -default -delimiter -describes -dialect -doubleQuote -encoding -foreignKeys -format -groupChar -header -headerRowCount -json -lang -length -lineTerminators -maxExclusive -maxInclusive -maxLength -maximum -minExclusive -minInclusive -minLength -minimum -name -notes -null -ordered -pattern -primaryKey -propertyUrl -quoteChar -reference -referencedRows -required -resource -row -rowTitles -rownum -schemaReference -scriptFormat -separator -skipBlankRows -skipColumns -skipInitialSpace -skipRows -source -suppressOutput -tableDirection -tableSchema -tables -targetFormat -textDirection -titles -transformations -trim -uriTemplate -url -valueUrl -virtual""".split() is_url = utils.is_url +OrderedType = Union[ + int, float, decimal.Decimal, datetime.date, datetime.datetime, datetime.timedelta] +ColRefType = tuple[str] +RowType = collections.OrderedDict[str, Any] +T = TypeVar('T') + -class Invalid: +class Invalid: # pylint: disable=R0903,C0115: pass INVALID = Invalid() -@attr.s +@dataclasses.dataclass class Dialect(BaseDialect): """ The spec is ambiguous regarding a default for the commentPrefix property: @@ -204,76 +81,29 @@ class Dialect(BaseDialect): So, in order to pass the number formatting tests, with column names like `##.#`, we chose the second reading - i.e. by default no rows are treated as comments. 
""" - commentPrefix = attr.ib( - default=None, - converter=functools.partial(utils.converter, str, None, allow_none=True), - validator=attr.validators.optional(attr.validators.instance_of(str))) - - -def json_open(filename, mode='r', encoding='utf-8'): - assert encoding == 'utf-8' - return io.open(filename, mode, encoding=encoding) - - -def get_json(fname) -> typing.Union[list, dict]: - fname = str(fname) - if is_url(fname): - return requests.get(fname).json(object_pairs_hook=collections.OrderedDict) - with json_open(fname) as f: - return json.load(f, object_pairs_hook=collections.OrderedDict) - - -def log_or_raise(msg, log=None, level='warning', exception_cls=ValueError): - if log: - getattr(log, level)(msg) - else: - raise exception_cls(msg) - - -def nolog(level='warning'): - from types import MethodType - - class Log(object): - pass - - log = Log() - setattr(log, level, MethodType(lambda *args, **kw: None, log)) - return log + commentPrefix: str = None class URITemplate(uritemplate.URITemplate): - + """URITemplate properties support expansion, given suitable context.""" def __eq__(self, other): if isinstance(other, str): return self.uri == other if not hasattr(other, 'uri'): return False - return super(URITemplate, self).__eq__(other) - - def asdict(self, **kw): - return '{}'.format(self) + return super().__eq__(other) + def asdict(self, **_): # pylint: disable=C0116 + return f'{self}' -def uri_template_property(): - """ - - Note: We do not currently provide support for supplying the "_" variables like "_row" - when expanding a URI template. - .. 
seealso:: http://w3c.github.io/csvw/metadata/#uri-template-properties - """ - def converter_uriTemplate(v): - if v is None: - return None - if not isinstance(v, str): - warnings.warn('Invalid value for aboutUrl property') - return INVALID - return URITemplate(v) - - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of((URITemplate, Invalid))), - converter=converter_uriTemplate) +def convert_uri_template(v): # pylint: disable=C0116 + if v is None: + return None # pragma: no cover + if not isinstance(v, str): + warnings.warn('Invalid value for Url property') + return INVALID + return URITemplate(v) class Link: @@ -282,19 +112,27 @@ class Link: .. seealso:: http://w3c.github.io/csvw/metadata/#link-properties """ - def __init__(self, string: typing.Union[str, pathlib.Path]): + def __init__(self, string: Union[str, pathlib.Path]): if not isinstance(string, (str, pathlib.Path)): raise ValueError('Invalid value for link property') self.string = string + @classmethod + def from_value(cls, v: Union['Link', str, pathlib.Path]): # pylint: disable=C0116 + if isinstance(v, Link): + return v # pragma: no cover + return cls(v) + def __str__(self): return self.string - def asdict(self, omit_defaults=True): + def asdict(self, **_): + """Not really a dict, but at least a JSON-serializable datatype.""" return self.string def __eq__(self, other): - # FIXME: Only naive, un-resolved comparison is supported at the moment. + # FIXME: pylint: disable=W0511 + # Only naive, un-resolved comparison is supported at the moment. 
return self.string == other.string if isinstance(other, Link) else False

     def resolve(self, base):
@@ -312,21 +150,16 @@ def resolve(self, base):
         return urljoin(base, self.string)


-def link_property(required=False):
-    return attr.ib(
-        default=None,
-        validator=attr.validators.optional(attr.validators.instance_of(Link)),
-        converter=lambda v: v if v is None else Link(v))
-
-
 class NaturalLanguage(collections.OrderedDict):
     """
+    A natural language property holds a collection of strings, optionally categorized into languages.

     .. seealso:: http://w3c.github.io/csvw/metadata/#natural-language-properties
     """
-
-    def __init__(self, value):
-        super(NaturalLanguage, self).__init__()
+    def __init__(
+            self,
+            value: Union[str, list[str], tuple[str], dict[str, Union[str, list[str], tuple[str]]]]):
+        super().__init__()
         self.value = value
         if isinstance(self.value, str):
             self[None] = [self.value]
@@ -351,7 +184,8 @@ def __init__(self, value):
         else:
             raise ValueError('invalid value type for NaturalLanguage')

-    def asdict(self, omit_defaults=True):
+    def asdict(self, **_):
+        """Serialize as dict."""
         if list(self) == [None]:
             if len(self[None]) == 1:
                 return self.getfirst()
@@ -360,154 +194,22 @@ def asdict(self, omit_defaults=True):
             ('und' if k is None else k, v[0] if len(v) == 1 else v)
             for k, v in self.items())

-    def add(self, string, lang=None):
+    def add(self, string: str, lang: Optional[str] = None) -> None:
+        """Add a string for a language."""
         if lang not in self:
             self[lang] = []
         self[lang].append(string)

-    def __str__(self):
+    def __str__(self) -> str:
         return self.getfirst() or next(iter(self.values()))[0]

-    def getfirst(self, lang=None):
+    def getfirst(self, lang: Optional[str] = None) -> Optional[str]:
+        """Return the first string specified for the given language tag."""
         return self.get(lang, [None])[0]


-def valid_id_property(v):
-    if not isinstance(v, str):
-        warnings.warn('Inconsistent link property')
-        return None
-    if v.startswith('_'):
-        raise ValueError('Invalid @id 
property: {}'.format(v)) - return v - - -def valid_common_property(v): - if isinstance(v, dict): - if not {k[1:] for k in v if k.startswith('@')}.issubset( - {'id', 'language', 'type', 'value'}): - raise ValueError( - "Aside from @value, @type, @language, and @id, the properties used on an object " - "MUST NOT start with @.") - if '@value' in v: - if len(v) > 1: - if len(v) > 2 \ - or set(v.keys()) not in [{'@value', '@language'}, {'@value', '@type'}] \ - or not isinstance(v['@value'], (str, bool, int, decimal.Decimal)): - raise ValueError( - "If a @value property is used on an object, that object MUST NOT have " - "any other properties aside from either @type or @language, and MUST " - "NOT have both @type and @language as properties. The value of the " - "@value property MUST be a string, number, or boolean value.") - if '@language' in v and '@value' not in v: - raise ValueError( - "A @language property MUST NOT be used on an object unless it also has a " - "@value property.") - if '@id' in v: - v['@id'] = valid_id_property(v['@id']) - if '@language' in v: - if not (isinstance(v['@language'], str) and tags.check(v['@language'])): - warnings.warn('Invalid language tag') - del v['@language'] - if '@type' in v: - vv = v['@type'] - if isinstance(vv, str): - if vv.startswith('_:'): - raise ValueError( - 'The value of any @id or @type contained within a metadata document ' - 'MUST NOT be a blank node.') - if not is_url(vv) and \ - not any(vv == ns or vv.startswith(ns + ':') for ns in NAMESPACES) and \ - vv not in CSVW_TERMS: - raise ValueError( - 'The value of any member of @type MUST be either a term defined in ' - '[csvw-context], a prefixed name where the prefix is a term defined in ' - '[csvw-context], or an absolute URL.') - elif not isinstance(vv, (list, dict)): - raise ValueError('Invalid datatype for @type') - return {k: valid_common_property(vv) for k, vv in v.items()} - if isinstance(v, list): - return [valid_common_property(vv) for vv in v] - return v - - 
-@attr.s -class DescriptionBase: - """Container for - - common properties (see http://w3c.github.io/csvw/metadata/#common-properties) - - @-properties. - """ - - common_props = attr.ib(default=attr.Factory(dict)) - at_props = attr.ib(default=attr.Factory(dict)) - - @classmethod - def partition_properties(cls, - d: typing.Union[dict, typing.Any], - type_name: typing.Optional[str] = None, - strict=True) -> typing.Union[dict, None]: - if d and not isinstance(d, dict): - return - fields = attr.fields_dict(cls) - type_name = type_name or cls.__name__ - c, a, dd = {}, {}, {} - for k, v in (d or {}).items(): - if k.startswith('@'): - if k == '@id': - v = valid_id_property(v) - if k == '@type' and v != type_name: - raise ValueError('Invalid @type property {} for {}'.format(v, type_name)) - a[k[1:]] = v - elif ':' in k: - c[k] = valid_common_property(v) - else: - if strict and (k not in fields): - warnings.warn('Invalid property {} for {}'.format(k, type_name)) - else: - dd[k] = v - return dict(common_props=c, at_props=a, **dd) - - @classmethod - def fromvalue(cls, d: dict): - return cls(**cls.partition_properties(d)) - - def _iter_dict_items(self, omit_defaults): - def _asdict_single(v): - return v.asdict(omit_defaults=omit_defaults) if hasattr(v, 'asdict') else v - - def _asdict_multiple(v): - if isinstance(v, (list, tuple)): - return [_asdict_single(vv) for vv in v] - return _asdict_single(v) - - for k, v in sorted(self.at_props.items()): - yield '@' + k, _asdict_multiple(v) - - for k, v in sorted(self.common_props.items()): - yield k, _asdict_multiple(v) - - for k, v in utils.attr_asdict(self, omit_defaults=omit_defaults).items(): - if k not in ('common_props', 'at_props'): - yield k, _asdict_multiple(v) - - def asdict(self, omit_defaults=True) -> dict: - # Note: The `null` property is the only inherited, list-valued property where the default - # is not the empty list. Thus, to allow setting it to empty, we must treat `null` as - # special case here. 
- # See also https://www.w3.org/TR/tabular-metadata/#dfn-inherited-property - return collections.OrderedDict( - (k, v) for k, v in self._iter_dict_items(omit_defaults) - if (k == 'null' or (v not in ([], {})))) - - -def optional_int(): - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(int)), - converter=lambda v: v if v is None else int(v)) - - -@attr.s -class Datatype(DescriptionBase): +@dataclasses.dataclass +class Datatype(DescriptionBase): # pylint: disable=too-many-instance-attributes """ A datatype description @@ -516,58 +218,39 @@ class Datatype(DescriptionBase): .. seealso:: ``_ """ - - base = attr.ib( - default=None, - converter=functools.partial( - utils.converter, - str, 'string', allow_none=True, cond=lambda ss: ss is None or ss in DATATYPES), - validator=attr.validators.optional(attr.validators.in_(DATATYPES))) - format = attr.ib(default=None) - length = optional_int() - minLength = optional_int() - maxLength = optional_int() - minimum = attr.ib(default=None) - maximum = attr.ib(default=None) - minInclusive = attr.ib(default=None) - maxInclusive = attr.ib(default=None) - minExclusive = attr.ib(default=None) - maxExclusive = attr.ib(default=None) - - @classmethod - def fromvalue(cls, v: typing.Union[str, dict, 'Datatype']) -> 'Datatype': - """ - :param v: Initialization data for `cls`; either a single string that is the main datatype \ - of the values of the cell or a datatype description object, i.e. a `dict` or a `cls` \ - instance. 
- :return: An instance of `cls` - """ - if isinstance(v, str): - return cls(base=v) - - if isinstance(v, dict): - v.setdefault('base', 'string') - return cls(**cls.partition_properties(v)) - - if isinstance(v, cls): - return v - - raise ValueError(v) - - def __attrs_post_init__(self): - for attr_ in [ - 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' - ]: - if getattr(self, attr_) is not None: - setattr(self, attr_, self.parse(getattr(self, attr_))) + base: str = None + format: Optional[str] = None + length: Optional[int] = None + minLength: Optional[int] = None # pylint: disable=C0103 + maxLength: Optional[int] = None # pylint: disable=C0103 + minimum: OrderedType = None + maximum: OrderedType = None + minInclusive: Optional[bool] = None # pylint: disable=C0103 + maxInclusive: Optional[bool] = None # pylint: disable=C0103 + minExclusive: Optional[bool] = None # pylint: disable=C0103 + maxExclusive: Optional[bool] = None # pylint: disable=C0103 + + def __post_init__(self): + self.base = functools.partial( + utils.type_checker, + str, + 'string', + allow_none=True, + cond=lambda ss: ss is None or ss in DATATYPES)(self.base) + self._set_constraints() + self._validate_constraints() + + def _validate_constraints(self): + def error_if(msg, *conditions): + if any(conditions): + raise ValueError(msg) if self.length is not None: - if self.minLength is not None and self.length < self.minLength: - raise ValueError('minLength > length') - - if self.maxLength is not None: - if self.length > self.maxLength: - raise ValueError('maxLength < length') + error_if( + 'Length limits interfere', + self.minLength is not None and self.length < self.minLength, + self.maxLength is not None and self.length > self.maxLength, + ) if self.minLength is not None and self.maxLength is not None \ and self.minLength > self.maxLength: @@ -579,34 +262,35 @@ def __attrs_post_init__(self): if not isinstance( self.basetype(), tuple((DATATYPES[name] for name in 
['decimal', 'float', 'datetime', 'duration']))): - if any([getattr(self, at) for at in - 'minimum maximum minExclusive maxExclusive minInclusive maxInclusive'.split()]): - raise ValueError( - 'Applications MUST raise an error if minimum, minInclusive, maximum, ' - 'maxInclusive, minExclusive, or maxExclusive are specified and the base ' - 'datatype is not a numeric, date/time, or duration type.') + error_if( + 'Applications MUST raise an error if minimum, minInclusive, maximum, ' + 'maxInclusive, minExclusive, or maxExclusive are specified and the base ' + 'datatype is not a numeric, date/time, or duration type.', + *[getattr(self, at) for at in + 'minimum maximum minExclusive maxExclusive minInclusive maxInclusive'.split()]) if not isinstance( self.basetype(), (DATATYPES['string'], DATATYPES['base64Binary'], DATATYPES['hexBinary'])): - if self.length or self.minLength or self.maxLength: - raise ValueError( - 'Applications MUST raise an error if length, maxLength, or minLength are ' - 'specified and the base datatype is not string or one of its subtypes, or a ' - 'binary type.') - - if (self.minInclusive and self.minExclusive) or (self.maxInclusive and self.maxExclusive): - raise ValueError( - 'Applications MUST raise an error if both minInclusive and minExclusive are ' - 'specified, or if both maxInclusive and maxExclusive are specified.') - - if (self.minInclusive and self.maxExclusive and self.maxExclusive <= self.minInclusive) or \ - (self.minInclusive and self.maxInclusive and self.maxInclusive < self.minInclusive): - raise ValueError('') - - if (self.minExclusive and self.maxExclusive and self.maxExclusive <= self.minExclusive) or ( - self.minExclusive and self.maxInclusive and self.maxInclusive <= self.minExclusive): - raise ValueError('') + error_if( + 'Applications MUST raise an error if length, maxLength, or minLength are ' + 'specified and the base datatype is not string or one of its subtypes, or a ' + 'binary type.', + self.length, self.minLength, 
self.maxLength) + + error_if( + 'Applications MUST raise an error if both minInclusive and minExclusive are ' + 'specified, or if both maxInclusive and maxExclusive are specified.', + self.minInclusive and self.minExclusive, + self.maxInclusive and self.maxExclusive, + ) + error_if( + 'Limits overlap', + self.minInclusive and self.maxExclusive and self.maxExclusive <= self.minInclusive, + self.minInclusive and self.maxInclusive and self.maxInclusive < self.minInclusive, + self.minExclusive and self.maxExclusive and self.maxExclusive <= self.minExclusive, + self.minExclusive and self.maxInclusive and self.maxInclusive <= self.minExclusive, + ) if 'id' in self.at_props and any( self.at_props['id'] == NAMESPACES['xsd'] + dt for dt in DATATYPES): @@ -618,7 +302,37 @@ def __attrs_post_init__(self): self.format = None warnings.warn('Invalid number pattern') - def asdict(self, omit_defaults=True): + def _set_constraints(self): + for att in ('length', 'maxLength', 'minLength'): + setattr(self, att, utils.optcast(int)(getattr(self, att))) + for attr_ in [ + 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' + ]: + if getattr(self, attr_) is not None: + setattr(self, attr_, self.parse(getattr(self, attr_))) + + @classmethod + def fromvalue(cls, d: Union[str, dict, 'Datatype']) -> 'Datatype': + """ + :param v: Initialization data for `cls`; either a single string that is the main datatype \ + of the values of the cell or a datatype description object, i.e. a `dict` or a `cls` \ + instance. 
+ :return: An instance of `cls` + """ + if isinstance(d, str): + return cls(base=d) + + if isinstance(d, dict): + d.setdefault('base', 'string') + return cls(**cls.partition_properties(d)) + + if isinstance(d, cls): + return d + + raise ValueError(d) + + def asdict(self, omit_defaults=True) -> dict: + """The datatype serialized as dict suitable for conversion to JSON.""" res = DescriptionBase.asdict(self, omit_defaults=omit_defaults) for attr_ in [ 'minimum', 'maximum', 'minInclusive', 'maxInclusive', 'minExclusive', 'maxExclusive' @@ -630,70 +344,59 @@ def asdict(self, omit_defaults=True): return res @property - def basetype(self): + def basetype(self) -> type: # pylint: disable=C0116 return DATATYPES[self.base] @property - def derived_description(self): + def derived_description(self) -> dict: # pylint: disable=C0116 return self.basetype.derived_description(self) - def formatted(self, v): + def formatted(self, v: Any) -> str: + """Format a value as string.""" return self.basetype.to_string(v, **self.derived_description) - def parse(self, v): + def parse(self, v: str) -> Any: + """Parse a string value into a Python type.""" if v is None: return v return self.basetype.to_python(v, **self.derived_description) - def validate(self, v): + def validate(self, v: T) -> T: + """Make sure the datatype-level constraints are met.""" if v is None: return v try: l_ = len(v or '') if self.length is not None and l_ != self.length: - raise ValueError('value must have length {}'.format(self.length)) + raise ValueError(f'value must have length {self.length}') if self.minLength is not None and l_ < self.minLength: - raise ValueError('value must have at least length {}'.format(self.minLength)) + raise ValueError(f'value must have at least length {self.minLength}') if self.maxLength is not None and l_ > self.maxLength: - raise ValueError('value must have at most length {}'.format(self.maxLength)) + raise ValueError(f'value must have at most length {self.maxLength}') except TypeError: 
pass if self.basetype.minmax: if self.minimum is not None and v < self.minimum: - raise ValueError('value must be >= {}'.format(self.minimum)) + raise ValueError(f'value must be >= {self.minimum}') if self.minInclusive is not None and v < self.minInclusive: - raise ValueError('value must be >= {}'.format(self.minInclusive)) + raise ValueError(f'value must be >= {self.minInclusive}') if self.minExclusive is not None and v <= self.minExclusive: - raise ValueError('value must be > {}'.format(self.minExclusive)) + raise ValueError(f'value must be > {self.minExclusive}') if self.maximum is not None and v > self.maximum: - raise ValueError('value must be <= {}'.format(self.maximum)) + raise ValueError(f'value must be <= {self.maximum}') if self.maxInclusive is not None and v > self.maxInclusive: - raise ValueError('value must be <= {}'.format(self.maxInclusive)) + raise ValueError(f'value must be <= {self.maxInclusive}') if self.maxExclusive is not None and v >= self.maxExclusive: - raise ValueError('value must be < {}'.format(self.maxExclusive)) + raise ValueError(f'value must be < {self.maxExclusive}') return v - def read(self, v): + def read(self, v: str) -> Any: + """Read a value according to the spec of the Datatype.""" return self.validate(self.parse(v)) -def converter_null(v): - res = [] if v is None else (v if isinstance(v, list) else [v]) - if not all(isinstance(vv, str) for vv in res): - warnings.warn('Invalid null property') - return [""] - return res - - -def converter_lang(v): - if not tags.check(v): - warnings.warn('Invalid language tag') - return 'und' - return v - - -@attr.s -class Description(DescriptionBase): +@dataclasses.dataclass +class Description(DescriptionBase): # pylint: disable=R0902 """Adds support for inherited properties. .. seealso:: http://w3c.github.io/csvw/metadata/#inherited-properties @@ -703,59 +406,69 @@ class Description(DescriptionBase): # reference to the containing object. 
Note that this attribute is ignored when judging # equality between objects. Thus, identically specified columns of different tables will be # considered equal. - _parent = attr.ib(default=None, repr=False, eq=False) - - aboutUrl = uri_template_property() - datatype = attr.ib( - default=None, - converter=lambda v: v if not v else Datatype.fromvalue(v)) - default = attr.ib( - default="", - converter=functools.partial(utils.converter, str, "", allow_list=False), - ) - lang = attr.ib(default="und", converter=converter_lang) - null = attr.ib(default=attr.Factory(lambda: [""]), converter=converter_null) - ordered = attr.ib( - default=None, - converter=functools.partial(utils.converter, bool, False, allow_none=True), - ) - propertyUrl = uri_template_property() - required = attr.ib(default=None) - separator = attr.ib( - converter=functools.partial(utils.converter, str, None, allow_none=True), - default=None, - ) - textDirection = attr.ib( - default=None, - converter=functools.partial( - utils.converter, - str, None, allow_none=True, cond=lambda v: v in [None, "ltr", "rtl", "auto", "inherit"]) - ) - valueUrl = uri_template_property() - - def inherit(self, attr): + _parent: Optional[DescriptionBase] = None + + aboutUrl: Optional[Union[URITemplate, Invalid]] = None # pylint: disable=C0103 + datatype: Optional[Datatype] = None + default: Optional[Union[str, list[str]]] = "" + lang: str = "und" + null: list[str] = dataclasses.field(default_factory=lambda: [""]) + ordered: Optional[bool] = None + propertyUrl: Optional[Union[URITemplate, Invalid]] = None # pylint: disable=C0103 + required: Optional[bool] = None + separator: Optional[str] = None + textDirection: Optional[ # pylint: disable=C0103 + Literal["ltr", "rtl", "auto", "inherit"]] = None + valueUrl: Optional[Union[URITemplate, Invalid]] = None # pylint: disable=C0103 + + def __post_init__(self): + if self.datatype is not None: + self.datatype = Datatype.fromvalue(self.datatype) + self.default = utils.type_checker(str, "", 
self.default, allow_list=False)
+        if not tags.check(self.lang):
+            warnings.warn('Invalid language tag')
+            self.lang = 'und'
+
+        self.null = [] if self.null is None else \
+            (self.null if isinstance(self.null, list) else [self.null])
+        if not all(isinstance(vv, str) for vv in self.null):
+            warnings.warn('Invalid null property')
+            self.null = [""]
+        self.ordered = utils.type_checker(bool, False, self.ordered, allow_none=True)
+        self.separator = utils.type_checker(str, None, self.separator, allow_none=True)
+        self.textDirection = utils.type_checker(
+            str,
+            None,
+            self.textDirection,
+            allow_none=True,
+            cond=lambda v: v in [None, "ltr", "rtl", "auto", "inherit"])
+        for att in ('valueUrl', 'aboutUrl', 'propertyUrl'):
+            if getattr(self, att) is not None:
+                setattr(self, att, convert_uri_template(getattr(self, att)))
+
+    def inherit(self, attr) -> Optional[Any]:
+        """
+        The implementation of the inheritance mechanism.
+
+        The chain of inheritance is established by assigning an object to `_parent`. If this
+        object has a method `inherit` as well (i.e. is derived from Description), the chain
+        may continue.
+        """
         v = getattr(self, attr)
         if v is None and self._parent:
             return self._parent.inherit(attr) if hasattr(self._parent, 'inherit') \
                 else getattr(self._parent, attr)
         return v

-    def inherit_null(self):
+    def inherit_null(self) -> list[str]:
+        """Inheritance of null is a special case due to the default value not being None."""
         if self.null == [""]:
             if self._parent and hasattr(self._parent, 'inherit_null'):
                 return self._parent.inherit_null()
             return self.null


-def converter_titles(v):
-    try:
-        return v if v is None else NaturalLanguage(v)
-    except ValueError:
-        warnings.warn('Invalid titles property')
-        return None
-
-
-@attr.s
+@dataclasses.dataclass
 class Column(Description):
     """
     A column description is an object that describes a single column.

     .. 
seealso:: ``_ """ - name = attr.ib( - default=None, - converter=functools.partial(utils.converter, str, None, allow_none=True) - ) - suppressOutput = attr.ib( - default=False, - converter=functools.partial(utils.converter, bool, False)) - titles = attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(NaturalLanguage)), - converter=converter_titles) - virtual = attr.ib(default=False, converter=functools.partial(utils.converter, bool, False)) - _number = attr.ib(default=None, repr=False) + name: str = None + suppressOutput: bool = False # pylint: disable=C0103 + titles: Optional[NaturalLanguage] = None + virtual: bool = False + _number: Optional[int] = None + + def __post_init__(self): + super().__post_init__() + self.name = utils.type_checker(str, None, self.name, allow_none=True) + self.suppressOutput = utils.type_checker(bool, False, self.suppressOutput) + + if self.titles is not None: + try: + self.titles = NaturalLanguage(self.titles) + except ValueError: + warnings.warn('Invalid titles property') + self.titles = None + + self.virtual = utils.type_checker(bool, False, self.virtual) def __str__(self): - return self.name or \ - (self.titles and self.titles.getfirst()) or \ - '_col.{}'.format(self._number) + return self.name or (self.titles and self.titles.getfirst()) or f'_col.{self._number}' - def has_title(self, v): + def __eq__(self, other): + return self.asdict() == other.asdict() + + def has_title(self, v) -> Union[str, bool]: + """ + Check whether the name or a title of the column matches v. + + If v matches a title, the associated language tag (or 'und') is returned. 
+ """ if self.name and self.name == v: return True for tag, titles in (self.titles or {}).items(): @@ -794,10 +519,11 @@ def has_title(self, v): return False @property - def header(self): - return '{}'.format(self) + def header(self) -> str: # pylint: disable=missing-function-docstring + return f'{self}' - def read(self, v, strict=True): + def read(self, v: str, strict=True) -> Any: + """Convert a str to a Python object according to the spec for the column.""" required = self.inherit('required') null = self.inherit_null() default = self.inherit('default') @@ -812,18 +538,18 @@ def read(self, v, strict=True): warnings.warn('required column value is missing') raise ValueError('required column value is missing') - if separator: + if separator: # A list-valued column. if not v: - v = [] + v = [] # Empty string is interpreted as empty list. elif v in null: - v = None + v = None # A null value is interpreted as missing data. else: v = (vv or default for vv in v.split(separator)) v = [None if vv in null else vv for vv in v] elif v in null: - v = None + v = None # A null value. - if datatype: + if datatype: # Apply datatype conversion. 
if isinstance(v, list): try: return [datatype.read(vv) for vv in v] @@ -835,7 +561,8 @@ def read(self, v, strict=True): return datatype.read(v) return v - def write(self, v): + def write(self, v: Any) -> str: + """Convert v to a string according to the specifications for the column.""" sep = self.inherit('separator') null = self.inherit_null() datatype = self.inherit('datatype') @@ -852,60 +579,57 @@ def fmt(v): return fmt(v) -def column_reference(): - return attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.instance_of(list)), - converter=lambda v: v if isinstance(v, list) or v is None else [v]) - - -@attr.s +@dataclasses.dataclass class Reference: + """A reference specification as used to describe the targets of foreign keys.""" + resource: Optional[Link] = None + schemaReference: Optional[Link] = None # pylint: disable=C0103 + columnReference: Optional[list[str]] = None # pylint: disable=C0103 - resource = link_property() - schemaReference = link_property() - columnReference = column_reference() + def __post_init__(self): + if self.resource is not None: + if self.schemaReference is not None: + # Either a local resource may be referenced or a schema - not both. 
+ raise ValueError(self) + self.resource = Link.from_value(self.resource) - def __attrs_post_init__(self): - if self.resource is not None and self.schemaReference is not None: - raise ValueError(self) + if self.schemaReference is not None: + self.schemaReference = Link.from_value(self.schemaReference) + if isinstance(self.columnReference, str): + self.columnReference = [self.columnReference] -@attr.s + +@dataclasses.dataclass class ForeignKey: + """A specification of a foreign key.""" + columnReference: Optional[list[str]] = None # pylint: disable=C0103 + reference: Optional[Reference] = None - columnReference = column_reference() - reference = attr.ib(default=None) + def __post_init__(self): + if isinstance(self.columnReference, str): + self.columnReference = [self.columnReference] @classmethod def fromdict(cls, d): + """Instantiate an object from a dict as returned by parsing the JSON metadata.""" if isinstance(d, dict): try: _ = Reference(**d['reference']) - except TypeError: - raise ValueError('Invalid reference property') + except TypeError as e: + raise ValueError('Invalid reference property') from e if not set(d.keys()).issubset({'columnReference', 'reference'}): raise ValueError('Invalid foreignKey spec') kw = dict(d, reference=Reference(**d['reference'])) return cls(**kw) - def asdict(self, **kw): - res = utils.attr_asdict(self, **kw) - res['reference'] = utils.attr_asdict(res['reference'], **kw) + def asdict(self, **kw) -> dict[str, Any]: # pylint: disable=C0116 + res = dataclass_asdict(self, **kw) + res['reference'] = dataclass_asdict(res['reference'], **kw) return res -def converter_foreignKeys(v): - res = [] - for d in functools.partial(utils.converter, dict, None)(v): - try: - res.append(ForeignKey.fromdict(d)) - except TypeError: - warnings.warn('Invalid foreignKeys spec') - return res - - -@attr.s +@dataclasses.dataclass class Schema(Description): """ A schema description is an object that encodes the information about a schema, which describes @@ 
-916,66 +640,84 @@ class Schema(Description): .. seealso:: ``_ """ - columns = attr.ib( - default=attr.Factory(list), - converter=lambda v: [ - Column.fromvalue(c) for c in functools.partial(utils.converter, dict, None)( - functools.partial(utils.converter, list, [])(v))]) - foreignKeys = attr.ib( - default=attr.Factory(list), - converter=lambda v: [] if v is None else converter_foreignKeys(v)) - primaryKey = column_reference() - rowTitles = attr.ib( - default=attr.Factory(list), - converter=lambda v: v if isinstance(v, list) else [v], - ) - - def __attrs_post_init__(self): + columns: list[Column] = dataclasses.field(default_factory=list) + foreignKeys: list[ForeignKey] = dataclasses.field(default_factory=list) # pylint: disable=C0103 + primaryKey: Optional[list[str]] = None # pylint: disable=C0103 + rowTitles: list[str] = dataclasses.field(default_factory=list) # pylint: disable=C0103 + + def __post_init__(self): + super().__post_init__() + self.columns = [ + Column.fromvalue(c) for c in + utils.type_checker(dict, None, utils.type_checker(list, [], self.columns))] + for i, col in enumerate(self.columns): + col._number = i + 1 # pylint: disable=protected-access + if self.foreignKeys is None: + self.foreignKeys = [] # pragma: no cover + else: + res = [] + for d in utils.type_checker(dict, None, self.foreignKeys): + try: + res.append(ForeignKey.fromdict(d)) + except TypeError: + warnings.warn('Invalid foreignKeys spec') + self.foreignKeys = res + + if self.primaryKey is not None and not isinstance(self.primaryKey, list): + self.primaryKey = [self.primaryKey] + self.rowTitles = self.rowTitles if isinstance(self.rowTitles, list) else [self.rowTitles] + virtual, seen, names = False, set(), set() for i, col in enumerate(self.columns): - if col.name and (col.name.startswith('_') or re.search(r'\s', col.name)): - warnings.warn('Invalid column name') - if col.virtual: # first virtual column sets the flag - virtual = True - elif virtual: # non-virtual column after virtual 
column! - raise ValueError('no non-virtual column allowed after virtual columns') - if not virtual: - if col.header in seen: - warnings.warn('Duplicate column name!') - if col.name: - if col.name in names: - raise ValueError('Duplicate column name {}'.format(col.name)) - names.add(col.name) - seen.add(col.header) - col._parent = self - col._number = i + 1 + virtual = self._check_col(col, virtual, names, seen) + col._parent = self # pylint: disable=protected-access for colref in self.primaryKey or []: col = self.columndict.get(colref) if col and not col.name: warnings.warn('A primaryKey referenced column MUST have a `name` property') self.primaryKey = None + def _check_col(self, col, virtual: bool, names: set[str], seen: set[str]) -> bool: + if col.name and (col.name.startswith('_') or re.search(r'\s', col.name)): + warnings.warn('Invalid column name') + if col.virtual: # first virtual column sets the flag + virtual = True + elif virtual: # non-virtual column after virtual column! + raise ValueError('no non-virtual column allowed after virtual columns') + if not virtual: + if col.header in seen: + warnings.warn('Duplicate column name!') + if col.name: + if col.name in names: + raise ValueError(f'Duplicate column name {col.name}') + names.add(col.name) + seen.add(col.header) + return virtual + @classmethod - def fromvalue(cls, v): - if isinstance(v, str): + def fromvalue(cls, d: Union[dict, str]) -> 'Schema': + """Instantiate a Schema from a dict or a URL to a JSON file.""" + if isinstance(d, str): try: # The schema is referenced with a URL - v = requests.get(v).json() - except: # pragma: no cover # noqa: E722 - return v - if not isinstance(v, dict): - if isinstance(v, int): + d = utils.request_get(d).json() + except: # pragma: no cover # noqa: E722 # pylint: disable=W0702 + return d + if not isinstance(d, dict): + if isinstance(d, int): warnings.warn('Invalid value for tableSchema property') - v = {} - return cls(**cls.partition_properties(v)) + d = {} + return 
cls(**cls.partition_properties(d)) @property - def columndict(self): + def columndict(self) -> dict[str, Column]: + """A table's columns mapped by header, i.e. normalized name.""" return {c.header: c for c in self.columns} - def get_column(self, name, strict=False): + def get_column(self, name: str, strict: bool = False) -> Optional[Column]: + """Resolve a Column by name, titles or propertyUrl.""" col = self.columndict.get(name) - assert (not strict) or (col and col.name) + assert (not strict) or (col and col.name), name if not col: for c in self.columns: if c.titles and c.titles.getfirst() == name: @@ -985,26 +727,7 @@ def get_column(self, name, strict=False): return col -def dialect_props(d): - if not isinstance(d, dict): - warnings.warn('Invalid dialect spec') - return {} - partitioned = Description.partition_properties(d, type_name='Dialect', strict=False) - del partitioned['at_props'] - del partitioned['common_props'] - if partitioned.get('headerRowCount'): - partitioned['header'] = True - return partitioned - - -def valid_transformations(instance, attribute, value): - if not isinstance(value, list): - warnings.warn('Invalid transformations property') - for tr in value: - Description.partition_properties(tr, type_name='Template') - - -@attr.s +@dataclasses.dataclass class TableLike(Description): """ A CSVW description object as encountered "in the wild", i.e. identified by URL on the web or @@ -1031,60 +754,50 @@ class TableLike(Description): and `URI template properties `_ (see :meth:`~TableLike.expand`). 
""" - dialect = attr.ib( - default=None, - converter=lambda v: v if (v is None or isinstance(v, str)) - else Dialect(**dialect_props(v))) - notes = attr.ib(default=attr.Factory(list)) - tableDirection = attr.ib( - default='auto', - converter=functools.partial( - utils.converter, str, 'auto', cond=lambda s: s in ['rtl', 'ltr', 'auto']), - validator=attr.validators.in_(['rtl', 'ltr', 'auto'])) - tableSchema = attr.ib( - default=None, - converter=lambda v: Schema.fromvalue(v)) - transformations = attr.ib( - validator=valid_transformations, - default=attr.Factory(list), - ) - url = link_property() - _fname = attr.ib(default=None) # The path of the metadata file. - - def __attrs_post_init__(self): + dialect: Optional[Union[str, Dialect]] = None + notes: list[str] = dataclasses.field(default_factory=list) + tableDirection: Literal['rtl', 'ltr', 'auto'] = 'auto' # pylint: disable=invalid-name + tableSchema: Optional[Schema] = None # pylint: disable=invalid-name + transformations: list = dataclasses.field(default_factory=list) + url: Optional[Link] = None + _fname: Union[str, pathlib.Path] = None # The path of the metadata file. 
+ + def __post_init__(self): + super().__post_init__() if isinstance(self.dialect, str): - self.dialect = Dialect(**dialect_props(get_json(Link(self.dialect).resolve(self.base)))) - if self.tableSchema and not (isinstance(self.tableSchema, str)): - self.tableSchema._parent = self + self.dialect = Dialect( + **dialect_props(utils.get_json(Link(self.dialect).resolve(self.base)))) + elif self.dialect is not None: + self.dialect = Dialect(**dialect_props(self.dialect)) + + self.tableDirection = utils.type_checker( + str, 'auto', self.tableDirection, cond=lambda s: s in ['rtl', 'ltr', 'auto']) + self.tableSchema = Schema.fromvalue(self.tableSchema) + + if not isinstance(self.transformations, list): + warnings.warn('Invalid transformations property') + for tr in self.transformations: + DescriptionBase.partition_properties(tr, type_name='Template') + if self.url is not None: + self.url = Link(self.url) + + if self.tableSchema and not isinstance(self.tableSchema, str): + self.tableSchema._parent = self # pylint: disable=protected-access if 'id' in self.at_props and self.at_props['id'] is None: self.at_props['id'] = self.base - ctx = self.at_props.get('context') - if isinstance(ctx, list): - for obj in ctx: - if (isinstance(obj, dict) and not set(obj.keys()).issubset({'@base', '@language'}))\ - or (isinstance(obj, str) and obj != 'http://www.w3.org/ns/csvw'): - raise ValueError( - 'The @context MUST have one of the following values: An array composed ' - 'of a string followed by an object, where the string is ' - 'http://www.w3.org/ns/csvw and the object represents a local context ' - 'definition, which is restricted to contain either or both of' - '@base and @language.') - if isinstance(obj, dict) and '@language' in obj: - if not tags.check(obj['@language']): - warnings.warn('Invalid value for @language property') - del obj['@language'] - - def get_column(self, spec): + valid_context_property(self.at_props.get('context')) + + def get_column(self, spec: str) -> 
Optional[Column]: # pylint: disable=C0116 return self.tableSchema.get_column(spec) if self.tableSchema else None @classmethod - def from_file(cls, fname: typing.Union[str, pathlib.Path], data=None) -> 'TableLike': + def from_file(cls, fname: Union[str, pathlib.Path], data=None) -> 'TableLike': """ Instantiate a CSVW Table or TableGroup description from a metadata file. """ if is_url(str(fname)): return cls.from_url(str(fname), data=data) - res = cls.fromvalue(data or get_json(fname)) + res = cls.fromvalue(data or utils.get_json(fname)) res._fname = pathlib.Path(fname) return res @@ -1093,7 +806,7 @@ def from_url(cls, url: str, data=None) -> 'TableLike': """ Instantiate a CSVW Table or TableGroup description from a metadata file specified by URL. """ - data = data or get_json(url) + data = data or utils.get_json(url) url = urlparse(url) data.setdefault('@base', urlunparse((url.scheme, url.netloc, url.path, '', '', ''))) for table in data.get('tables', [data]): @@ -1102,7 +815,7 @@ def from_url(cls, url: str, data=None) -> 'TableLike': res = cls.fromvalue(data) return res - def to_file(self, fname: typing.Union[str, pathlib.Path], omit_defaults=True) -> pathlib.Path: + def to_file(self, fname: Union[str, pathlib.Path], omit_defaults=True) -> pathlib.Path: """ Write a CSVW Table or TableGroup description as JSON object to a local file. @@ -1110,14 +823,14 @@ def to_file(self, fname: typing.Union[str, pathlib.Path], omit_defaults=True) -> description objects. If `omit_defaults==True`, these properties will be pruned from \ the JSON object. """ - fname = utils.ensure_path(fname) + fname = pathlib.Path(fname) data = self.asdict(omit_defaults=omit_defaults) - with json_open(str(fname), 'w') as f: + with utils.json_open(str(fname), 'w') as f: json.dump(data, f, indent=4, separators=(',', ': ')) return fname @property - def base(self) -> typing.Union[str, pathlib.Path]: + def base(self) -> Union[str, pathlib.Path]: """ The "base" to resolve relative links against. 
""" @@ -1133,8 +846,9 @@ def base(self) -> typing.Union[str, pathlib.Path]: # **base URL** for other URLs in the metadata document. return Link(ctxbase).resolve(at_props['base']) return at_props['base'] - return self._parent._fname.parent if (self._parent and self._parent._fname) else \ - (self._fname.parent if self._fname else None) + if self._parent and self._parent._fname: # pylint: disable=protected-access + return self._parent._fname.parent # pylint: disable=protected-access + return self._fname.parent if self._fname else None # pylint: disable=protected-access def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> str: """ @@ -1158,7 +872,7 @@ def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> if tmpl.uri.startswith(prefix + ':'): # If the URI Template is a QName, we expand it to a URL to prevent `Link.resolve` # from turning it into a local path. - res = '{}{}'.format(url, tmpl.uri.split(':')[1]) + res = f"{url}{tmpl.uri.split(':')[1]}" break else: res = Link( @@ -1176,7 +890,51 @@ def expand(self, tmpl: URITemplate, row: dict, _row, _name=None, qname=False) -> return res -@attr.s +@dataclasses.dataclass(frozen=True) +class CsvRow: + """A bag of attributes specifying a row in a CSV file.""" + fname: str + lineno: int + row: list[str] + + +@dataclasses.dataclass +class RowParseSpec: + """A bag of attributes used when parsing a CSV row.""" + strict: bool + log: Optional[logging.Logger] + row_implementation: type = collections.OrderedDict + error: bool = False + + def log_error(self, msg: str): + """Log and record error.""" + utils.log_or_raise(msg, log=self.log) + self.error = True + + +@dataclasses.dataclass +class TableParseSpec: + """Some metadata, categorizing columns in a table.""" + colnames: list[str] = dataclasses.field(default_factory=list) + virtualcols: list[tuple[str, URITemplate]] = dataclasses.field(default_factory=list) + requiredcols: set[str] = dataclasses.field(default_factory=set) + + 
@classmethod + def from_columns(cls, columns: Iterable[Column]) -> 'TableParseSpec': + """Initialize from columns (e.g. columns property of Schema).""" + res = cls() + for col in columns: + if col.virtual: + if col.valueUrl: + res.virtualcols.append((col.header, col.valueUrl)) + else: + res.colnames.append(col.header) + if col.required: + res.requiredcols.add(col.header) + return res + + +@dataclasses.dataclass class Table(TableLike): """ A table description is an object that describes a table within a CSV file. @@ -1191,7 +949,7 @@ class Table(TableLike): .. seealso:: ``_ """ - suppressOutput = attr.ib(default=False) + suppressOutput: bool = False # pylint: disable=invalid-name _comments = [] def add_foreign_key(self, colref, ref_resource, ref_colref): @@ -1204,31 +962,32 @@ def add_foreign_key(self, colref, ref_resource, ref_colref): """ colref = [colref] if not isinstance(colref, (tuple, list)) else colref if not all(col in [c.name for c in self.tableSchema.columns] for col in colref): - raise ValueError('unknown column in foreignKey {0}'.format(colref)) + raise ValueError(f'unknown column in foreignKey {colref}') self.tableSchema.foreignKeys.append(ForeignKey.fromdict({ 'columnReference': colref, 'reference': {'resource': ref_resource, 'columnReference': ref_colref} })) - def __attrs_post_init__(self): - TableLike.__attrs_post_init__(self) + def __post_init__(self): + TableLike.__post_init__(self) if not self.url: raise ValueError('url property is required for Tables') @property - def local_name(self) -> typing.Union[str, None]: + def local_name(self) -> Union[str, None]: + """The filename of a table.""" return self.url.string if self.url else None def _get_dialect(self) -> Dialect: return self.dialect or (self._parent and self._parent.dialect) or Dialect() def write(self, - items: typing.Iterable[typing.Union[dict, list, tuple]], - fname: typing.Optional[typing.Union[str, pathlib.Path]] = DEFAULT, - base: typing.Optional[typing.Union[str, pathlib.Path]] = 
None, - strict: typing.Optional[bool] = False, - _zipped: typing.Optional[bool] = False) -> typing.Union[str, int]: + items: Iterable[Union[dict, list, tuple]], + fname: Optional[Union[str, pathlib.Path]] = DEFAULT, + base: Optional[Union[str, pathlib.Path]] = None, + strict: Optional[bool] = False, + _zipped: Optional[bool] = False) -> Union[str, int]: """ Write row items to a CSV file according to the table schema. @@ -1254,13 +1013,12 @@ def write(self, row = [col.write(item[i]) for i, col in enumerate(non_virtual_cols)] else: if strict: - add = set(item.keys()) - {'{}'.format(col) for col in non_virtual_cols} + add = set(item.keys()) - {f'{col}' for col in non_virtual_cols} if add: - raise ValueError("dict contains fields not in fieldnames: {}".format( - ', '.join("'{}'".format(field) for field in add))) + add = ', '.join(f"'{field}'" for field in add) + raise ValueError(f"dict contains fields not in fieldnames: {add}") row = [ - col.write(item.get( - col.header, item.get('{}'.format(col)))) + col.write(item.get(col.header, item.get(f'{col}'))) for col in non_virtual_cols] rowcount += 1 writer.writerow(row) @@ -1278,6 +1036,12 @@ def write(self, return rowcount def check_primary_key(self, log=None, items=None) -> bool: + """Make sure primary keys are unique.""" + # We want to silence error logging when reading table rows, because we are not interested + # in conversion errors here. 
+ nolog = logging.getLogger(__name__) + nolog.addHandler(logging.NullHandler()) + success = True if items is not None: warnings.warn('the items argument of check_primary_key ' @@ -1286,12 +1050,10 @@ def check_primary_key(self, log=None, items=None) -> bool: get_pk = operator.itemgetter(*self.tableSchema.primaryKey) seen = set() # Read all rows in the table, ignoring errors: - for fname, lineno, row in self.iterdicts(log=nolog(), with_metadata=True): + for fname, lineno, row in self.iterdicts(log=nolog, with_metadata=True): pk = get_pk(row) if pk in seen: - log_or_raise( - '{0}:{1} duplicate primary key: {2}'.format(fname, lineno, pk), - log=log) + utils.log_or_raise(f'{fname}:{lineno} duplicate primary key: {pk}', log=log) success = False else: seen.add(pk) @@ -1300,14 +1062,123 @@ def check_primary_key(self, log=None, items=None) -> bool: def __iter__(self): return self.iterdicts() - def iterdicts( + def _get_csv_reader(self, fname, dialect, stack) -> UnicodeReaderWithLineNumber: + if is_url(fname): + handle = io.TextIOWrapper( + io.BytesIO(utils.request_get(str(fname)).content), encoding=dialect.encoding) + else: + handle = fname + fpath = pathlib.Path(fname) + if not fpath.exists(): + zipfname = fpath.parent.joinpath(fpath.name + '.zip') + if zipfname.exists(): + zipf = stack.enter_context(zipfile.ZipFile(zipfname)) # pylint: disable=R1732 + handle = io.TextIOWrapper( + zipf.open([n for n in zipf.namelist() if n.endswith(fpath.name)][0]), + encoding=dialect.encoding) + + return stack.enter_context(UnicodeReaderWithLineNumber(handle, dialect=dialect)) + + def _validated_csv_header(self, header, strict) -> list[str]: + if not strict: + if self.tableSchema.columns and len(self.tableSchema.columns) < len(header): + warnings.warn('Column number mismatch') + for name, col in zip(header, self.tableSchema.columns): + res = col.has_title(name) + if (not col.name) and not res: + warnings.warn('Incompatible table models') + if (isinstance(res, str) and # noqa: W504 + 
res.split('-')[0] not in ['und', (self.lang or 'und').split('-')[0]]): + warnings.warn('Incompatible column titles') + return header + + def _read_row( + self, + row: CsvRow, + parse_spec: RowParseSpec, + header_cols: list[tuple[int, str, Column]], + spec: TableParseSpec, + ) -> RowType: + required = {h: j for j, h, c in header_cols if c and c.required} + res = parse_spec.row_implementation() + + for (j, k, col), v in zip(header_cols, row.row): + # see http://w3c.github.io/csvw/syntax/#parsing-cells + if col: + try: + res[col.header] = col.read(v, strict=parse_spec.strict) + except ValueError as e: + if not parse_spec.strict: + warnings.warn(f'Invalid column value: {v} {col.datatype}; {e}') + res[col.header] = v + else: + parse_spec.log_error(f'{row.fname}:{row.lineno}:{j + 1} {k}: {e}') + if k in required: + del required[k] + else: + if parse_spec.strict: + warnings.warn(f'Unspecified column "{k}" in table {self.local_name}') + res[k] = v + + for k, j in required.items(): + if k not in res: + parse_spec.log_error( + f'{row.fname}:{row.lineno}:{j + 1} {k}: required column value is missing') + + # Augment result with regular columns not provided in the data: + for key in spec.colnames: + res.setdefault(key, None) + + # Augment result with virtual columns: + for key, value_url in spec.virtualcols: + res[key] = value_url.expand(**res) + return res + + def _get_header_cols( + self, + header: list[str], + colnames: list[str], + strict: bool, + row: Iterable, + ) -> list[tuple[int, str, Column]]: + def default_col(index): + return Column.fromvalue({'name': f'_col.{index}'}) + + # If columns in the data are ordered as in the spec, we can match values to + # columns by index, rather than looking up columns by name. 
+ if (header == colnames) or \ + (len(self.tableSchema.columns) >= len(header) and not strict): + # Note that virtual columns are only allowed to come **after** regular ones, + # so we can simply zip the whole columns list, and silently ignore surplus + # virtual columns. + header_cols = list(zip(header, self.tableSchema.columns)) + elif not strict and self.tableSchema.columns and \ + (len(self.tableSchema.columns) < len(header)): + header_cols = [] + for i, cname in enumerate(header): + try: + header_cols.append((cname, self.tableSchema.columns[i])) + except IndexError: + col = default_col(i + 1) + header_cols.append((col.name, col)) + else: + header_cols = [(h, self.tableSchema.get_column(h)) for h in header] + + if not header_cols: + for i, _ in enumerate(row): + col = default_col(i + 1) + header_cols.append((col.name, col)) + + return [(j, h, c) for j, (h, c) in enumerate(header_cols)] + + def iterdicts( # pylint: disable=too-many-locals self, - log=None, - with_metadata=False, + log: Optional[logging.Logger] = None, + with_metadata: bool = False, fname=None, - _Row=collections.OrderedDict, + _Row: type = collections.OrderedDict, # pylint: disable=invalid-name strict=True, - ) -> typing.Generator[dict, None, None]: + ) -> Generator[Union[dict[str, Any], tuple[str, int, dict[str, Any]]], None, None]: """Iterate over the rows of the table Create an iterator that maps the information in each row to a `dict` whose keys are @@ -1330,147 +1201,71 @@ def iterdicts( """ dialect = self._get_dialect() fname = fname or self.url.resolve(self.base) - colnames, virtualcols, requiredcols = [], [], set() - for col in self.tableSchema.columns: - if col.virtual: - if col.valueUrl: - virtualcols.append((col.header, col.valueUrl)) - else: - colnames.append(col.header) - if col.required: - requiredcols.add(col.header) + + table_parse_spec = TableParseSpec.from_columns(self.tableSchema.columns) with contextlib.ExitStack() as stack: - if is_url(fname): - handle = io.TextIOWrapper( - 
io.BytesIO(requests.get(str(fname)).content), encoding=dialect.encoding) - else: - handle = fname - fpath = pathlib.Path(fname) - if not fpath.exists(): - zipfname = fpath.parent.joinpath(fpath.name + '.zip') - if zipfname.exists(): - zipf = stack.enter_context(zipfile.ZipFile(str(zipfname))) - handle = io.TextIOWrapper( - zipf.open([n for n in zipf.namelist() if n.endswith(fpath.name)][0]), - encoding=dialect.encoding) - - reader = stack.enter_context(UnicodeReaderWithLineNumber(handle, dialect=dialect)) - reader = iter(reader) + reader = iter(self._get_csv_reader(fname, dialect, stack)) # If the data file has a header row, this row overrides the header as # specified in the metadata. if dialect.header: try: - _, header = next(reader) - if not strict: - if self.tableSchema.columns and len(self.tableSchema.columns) < len(header): - warnings.warn('Column number mismatch') - for name, col in zip(header, self.tableSchema.columns): - res = col.has_title(name) - if (not col.name) and not res: - warnings.warn('Incompatible table models') - if isinstance(res, str) and res.split('-')[0] not in [ - 'und', (self.lang or 'und').split('-')[0]]: - warnings.warn('Incompatible column titles') + header = self._validated_csv_header(next(reader)[1], strict) except StopIteration: # pragma: no cover return else: - header = colnames - - # If columns in the data are ordered as in the spec, we can match values to - # columns by index, rather than looking up columns by name. - if (header == colnames) or \ - (len(self.tableSchema.columns) >= len(header) and not strict): - # Note that virtual columns are only allowed to come **after** regular ones, - # so we can simply zip the whole columns list, and silently ignore surplus - # virtual columns. 
- header_cols = list(zip(header, self.tableSchema.columns)) - elif not strict and self.tableSchema.columns and \ - (len(self.tableSchema.columns) < len(header)): - header_cols = [] - for i, cname in enumerate(header): - try: - header_cols.append((cname, self.tableSchema.columns[i])) - except IndexError: - header_cols.append(( - '_col.{}'.format(i + 1), - Column.fromvalue({'name': '_col.{}'.format(i + 1)}))) - else: - header_cols = [(h, self.tableSchema.get_column(h)) for h in header] - header_cols = [(j, h, c) for j, (h, c) in enumerate(header_cols)] - missing = requiredcols - set(c.header for j, h, c in header_cols if c) - if missing: - raise ValueError('{0} is missing required columns {1}'.format(fname, missing)) - - for lineno, row in reader: - required = {h: j for j, h, c in header_cols if c and c.required} - res = _Row() - error = False - if (not header_cols) and row: - header_cols = [ - (i, - '_col.{}'.format(i + 1), - Column.fromvalue({'name': '_col.{}'.format(i + 1)})) - for i, _ in enumerate(row)] - for (j, k, col), v in zip(header_cols, row): - # see http://w3c.github.io/csvw/syntax/#parsing-cells - if col: - try: - res[col.header] = col.read(v, strict=strict) - except ValueError as e: - if not strict: - warnings.warn( - 'Invalid column value: {} {}; {}'.format(v, col.datatype, e)) - res[col.header] = v - else: - log_or_raise( - '{0}:{1}:{2} {3}: {4}'.format(fname, lineno, j + 1, k, e), - log=log) - error = True - if k in required: - del required[k] - else: - if strict: - warnings.warn( - 'Unspecified column "{0}" in table {1}'.format(k, self.local_name)) - res[k] = v - - for k, j in required.items(): - if k not in res: - log_or_raise( - '{0}:{1}:{2} {3}: {4}'.format( - fname, lineno, j + 1, k, 'required column value is missing'), - log=log) - error = True - - # Augment result with regular columns not provided in the data: - for key in colnames: - res.setdefault(key, None) - - # Augment result with virtual columns: - for key, valueUrl in virtualcols: - 
res[key] = valueUrl.expand(**res) - - if not error: - if with_metadata: - yield fname, lineno, res - else: - yield res + header = table_parse_spec.colnames + + header_cols = None + for i, (lineno, row) in enumerate(reader): + if i == 0: + header_cols = self._get_header_cols( + header, table_parse_spec.colnames, strict, row) + missing = table_parse_spec.requiredcols - \ + {c.header for j, h, c in header_cols if c} + if missing: + raise ValueError(f'{fname} is missing required columns {missing}') + + parse_spec = RowParseSpec(strict=strict, log=log, row_implementation=_Row) + res = self._read_row( + CsvRow(fname=fname, lineno=lineno, row=row), + parse_spec, + header_cols, + table_parse_spec, + ) + if not parse_spec.error: + yield (fname, lineno, res) if with_metadata else res self._comments = reader.comments -def converter_tables(v): - res = [] - for vv in v: - if not isinstance(vv, (dict, Table)): - warnings.warn('Invalid value for Table spec') - else: - res.append(Table.fromvalue(vv) if isinstance(vv, dict) else vv) - return res +@dataclasses.dataclass(frozen=True) +class ForeignKeyInstance: + """Simple structure holding the specification of a foreign key.""" + target_table: Table + pk: ColRefType + source_table: Table + fk: ColRefType + + def validate(self, strict: bool) -> None: + """Checks whether the colrefs for fk and pk match.""" + if len(self.fk) != len(self.pk): + raise ValueError( + 'Foreign key error: non-matching number of columns in source and target') + for scol, tcol in zip(self.fk, self.pk): + scolumn = self.source_table.tableSchema.get_column(scol, strict=strict) + tcolumn = self.target_table.tableSchema.get_column(tcol, strict=strict) + if not (scolumn and tcolumn): + raise ValueError( + f'Foreign key error: missing column "{scol}" or "{tcol}"') + if scolumn.datatype and tcolumn.datatype and \ + scolumn.datatype.base != tcolumn.datatype.base: + raise ValueError( + f'Foregin key error: non-matching datatype "{scol}:{scolumn.datatype.base}" ' + f'or 
"{tcol}:{tcolumn.datatype.base}"') -@attr.s +@dataclasses.dataclass class TableGroup(TableLike): """ A table group description is an object that describes a group of tables. @@ -1485,15 +1280,23 @@ class TableGroup(TableLike): .. seealso:: ``_ """ - tables = attr.ib(repr=False, default=attr.Factory(list), converter=converter_tables) + tables: list[Table] = dataclasses.field(default_factory=list) - def __attrs_post_init__(self): - TableLike.__attrs_post_init__(self) + def __post_init__(self): + res = [] + for vv in self.tables: + if not isinstance(vv, (dict, Table)): + warnings.warn('Invalid value for Table spec') + else: + res.append(Table.fromvalue(vv) if isinstance(vv, dict) else vv) + self.tables = res + super().__post_init__() for table in self.tables: - table._parent = self + table._parent = self # pylint: disable=protected-access @classmethod def from_frictionless_datapackage(cls, dp): + """Initialize a TableGroup from a frictionless DataPackage.""" return DataPackage(dp).to_tablegroup(cls) def read(self): @@ -1503,10 +1306,10 @@ def read(self): return {tname: list(t.iterdicts()) for tname, t in self.tabledict.items()} def write(self, - fname: typing.Union[str, pathlib.Path], - strict: typing.Optional[bool] = False, - _zipped: typing.Optional[bool] = False, - **items: typing.Iterable[typing.Union[list, tuple, dict]]): + fname: Union[str, pathlib.Path], + strict: Optional[bool] = False, + _zipped: Optional[bool] = False, + **items: Iterable[Union[list, tuple, dict]]): """ Write a TableGroup's data and metadata to files. @@ -1519,7 +1322,7 @@ def write(self, self.tabledict[tname].write(rows, base=fname.parent, strict=strict, _zipped=_zipped) self.to_file(fname) - def copy(self, dest: typing.Union[pathlib.Path, str]): + def copy(self, dest: Union[pathlib.Path, str]): """ Write a TableGroup's data and metadata to files relative to `dest`, adapting the `base` attribute. 
@@ -1534,38 +1337,31 @@ def copy(self, dest: typing.Union[pathlib.Path, str]): self.to_file(self._fname) @property - def tabledict(self) -> typing.Dict[str, Table]: + def tabledict(self) -> dict[str, Table]: + """Convenient access to tables by name.""" return {t.local_name: t for t in self.tables} - def foreign_keys(self) -> typing.List[typing.Tuple[Table, list, Table, list]]: - return [ - ( - self.tabledict[fk.reference.resource.string], - fk.reference.columnReference, - t, - fk.columnReference) - for t in self.tables for fk in t.tableSchema.foreignKeys - if not fk.reference.schemaReference] - - def validate_schema(self, strict=False): + def validate_schema(self, strict: bool = False) -> list[ForeignKeyInstance]: + """Check whether pk and fk specs in foreign key constraints match.""" + try: + fkis = sorted( + [ + ForeignKeyInstance( + self.tabledict[fk.reference.resource.string], + tuple(fk.reference.columnReference), + t, + tuple(fk.columnReference)) + for t in self.tables for fk in t.tableSchema.foreignKeys + if not fk.reference.schemaReference], + key=lambda x: (x.target_table.local_name, x.pk, x.source_table.local_name)) + except KeyError as e: + raise ValueError(f'Foreign key error: missing table "{e}" referenced') from e try: - for st, sc, tt, tc in self.foreign_keys(): - if len(sc) != len(tc): - raise ValueError( - 'Foreign key error: non-matching number of columns in source and target') - for scol, tcol in zip(sc, tc): - scolumn = st.tableSchema.get_column(scol, strict=strict) - tcolumn = tt.tableSchema.get_column(tcol, strict=strict) - if not (scolumn and tcolumn): - raise ValueError( - 'Foregin key error: missing column "{}" or "{}"'.format(scol, tcol)) - if scolumn.datatype and tcolumn.datatype and \ - scolumn.datatype.base != tcolumn.datatype.base: - raise ValueError( - 'Foregin key error: non-matching datatype "{}:{}" or "{}:{}"'.format( - scol, scolumn.datatype.base, tcol, tcolumn.datatype.base)) - except (KeyError, AssertionError) as e: - raise 
ValueError('Foreign key error: missing table "{}" referenced'.format(e)) + for fki in fkis: + fki.validate(strict=strict) + except AssertionError as e: + raise ValueError(f'Foreign key error: missing column "{e}" referenced') from e + return fkis def check_referential_integrity(self, data=None, log=None, strict=False): """ @@ -1579,69 +1375,142 @@ def check_referential_integrity(self, data=None, log=None, strict=False): for fk in t.tableSchema.foreignKeys: for row in t: if any(row.get(col) is None for col in fk.columnReference): - raise ValueError('Foreign key column is null: {} {}'.format( - [row.get(col) for col in fk.columnReference], fk.columnReference)) + raise ValueError( + f'Foreign key column is null: ' + f'{[row.get(col) for col in fk.columnReference]} ' + f'{fk.columnReference}') try: - self.validate_schema() + fkis = self.validate_schema() success = True except ValueError as e: + fkis = [] success = False - log_or_raise(str(e), log=log, level='error') - fkeys = self.foreign_keys() - # FIXME: We only support Foreign Key references between tables! - fkeys = sorted(fkeys, key=lambda x: (x[0].local_name, x[1], x[2].local_name)) + utils.log_or_raise(str(e), log=log, level='error') + + # FIXME: We only support Foreign Key references between tables! pylint: disable=W0511 + # We group foreign key constraints by target table, because we only want to read the + # available primary keys once and then check all tables referencing the target table in + # a loop. + # # Grouping by local_name of tables - even though we'd like to have the table objects # around, too. This it to prevent going down the rabbit hole of comparing table objects # for equality, when comparison of the string names is enough. 
- for _, grp in itertools.groupby(fkeys, lambda x: x[0].local_name): + for _, grp in itertools.groupby(fkis, lambda x: x.target_table.local_name): grp = list(grp) - table = grp[0][0] - t_fkeys = [(key, [(child, ref) for _, _, child, ref in kgrp]) - for key, kgrp in itertools.groupby(grp, lambda x: x[1])] - get_seen = [(operator.itemgetter(*key), set()) for key, _ in t_fkeys] - for row in table.iterdicts(log=log): - for get, seen in get_seen: - if get(row) in seen: - # column references for a foreign key are not unique! - if strict: - success = False - seen.add(get(row)) - for (key, children), (_, seen) in zip(t_fkeys, get_seen): - single_column = (len(key) == 1) - for child, ref in children: - get_ref = operator.itemgetter(*ref) - for fname, lineno, item in child.iterdicts(log=log, with_metadata=True): - colref = get_ref(item) - if colref is None: - continue - elif single_column and isinstance(colref, list): - # We allow list-valued columns as foreign key columns in case - # it's not a composite key. If a foreign key is list-valued, we - # check for a matching row for each of the values in the list. - colrefs = colref - else: - colrefs = [colref] - for colref in colrefs: - if not single_column and None in colref: # pragma: no cover - # TODO: raise if any(c is not None for c in colref)? 
- continue - elif colref not in seen: - log_or_raise( - '{0}:{1} Key `{2}` not found in table {3}'.format( - fname, - lineno, - colref, - table.url.string), - log=log) - success = False + target_table = grp[0].target_table + fks = collections.OrderedDict() + for pk, kgrp in itertools.groupby(grp, lambda x: x.pk): + fks[tuple(pk)] = [(fk.source_table, tuple(fk.fk)) for fk in kgrp] + success = self._check_fks_referencing_table(success, target_table, fks, strict, log) + return success + + @staticmethod + def _check_fks_referencing_table( + success: bool, + target_table: Table, + fks: collections.OrderedDict[ColRefType, list[tuple[Table, ColRefType]]], + strict: bool, + log: logging.Logger, + ) -> bool: + """Check all foreign keys referencing the same table.""" + target_table = ReferencedTable( + target_table, collections.OrderedDict((fk, len(fk) == 1) for fk in fks), log) + # Now read the available primary keys for each foreign key constraint to the table. + success = target_table.get_pks(success, strict) + for pk, source_tables in fks.items(): + # For each foreign key constraint referencing `target_table` we check the fk values. + for source_table, fk in source_tables: + success = target_table.check_fks(success, pk, source_table, fk) + return success + + +@dataclasses.dataclass +class ReferencedTable: + """ + Wraps a Table object to simplify checking of foreign key references. 
+ """ + table: Table + # The colrefs which are referenced in foreign keys to the table mapped to whether they are a + # single column or a composite key: + pks: collections.OrderedDict[ColRefType, bool] + log: logging.Logger + # We store values in table rows for each pk colref: + refs: dict[ColRefType, set] = dataclasses.field( + default_factory=lambda: collections.defaultdict(set)) + + def get_pks(self, success: bool, strict: bool) -> bool: + """Read the actual pk values in the table.""" + itemgetters = {pk: operator.itemgetter(*pk) for pk in self.pks} + for row in self.table.iterdicts(log=self.log): + for pk in self.pks: + vals = itemgetters[pk](row) + if vals in self.refs[pk]: + # Values for a primary key are not unique! + # https://w3c.github.io/csvw/tests/#manifest-validation#test258 + if strict: + success = False + self.refs[pk].add(vals) + return success + + def _check_item(self, success: bool, vals: 'RefValues', pk: ColRefType) -> bool: + """ + We check whether the values for the foreign key are available in the referenced table. + """ + pks = self.refs[pk] + single_column = self.pks[pk] + if vals.values is None: # null-valued foreign key. + return success + if single_column and isinstance(vals.values, list): + # We allow list-valued columns as foreign key columns in case it's not a composite key. + # If a foreign key is list-valued, we check for a matching row for each of the values + # in the list. + refs = vals.values + else: + refs = [vals.values] + for ref in refs: + if not single_column and None in ref: # pragma: no cover + # A composite key and one component of the fk is null? + # TODO: raise if any(c is not None for c in values)? pylint: disable=W0511 + continue + if ref not in pks: + utils.log_or_raise( + f'{vals} not found in table {self.table.url.string}', log=self.log) + success = False return success + + def check_fks( + self, + success: bool, + pk: ColRefType, + source_table: Table, + fk: ColRefType, + ) -> bool: + """ + Check one fk constraint, i.e. 
whether the fk values in self.table actually can be found + in `target_table`. + """ + for fname, lineno, item in source_table.iterdicts(log=self.log, with_metadata=True): + item = RefValues(fname=fname, lineno=lineno, values=operator.itemgetter(*fk)(item)) + success = self._check_item(success, item, pk) + return success + + +@dataclasses.dataclass(frozen=True) +class RefValues: + """Bundle properties of a table row for simpler checking.""" + fname: str + lineno: int + values: Union[str, list[str]] + + def __str__(self): + return f'{self.fname}:{self.lineno} Key `{self.values}`' + class CSVW: """ Python API to read CSVW described data and convert it to JSON. """ - def __init__(self, url: str, md_url: typing.Optional[str] = None, validate: bool = False): + def __init__(self, url: str, md_url: Optional[str] = None, validate: bool = False): self.warnings = [] w = None with contextlib.ExitStack() as stack: @@ -1650,7 +1519,7 @@ def __init__(self, url: str, md_url: typing.Optional[str] = None, validate: bool no_header = False try: - md = get_json(md_url or url) + md = utils.get_json(md_url or url) # The URL could be read as JSON document, thus, the user supplied us with overriding # metadata as per https://w3c.github.io/csvw/syntax/#overriding-metadata except json.decoder.JSONDecodeError: @@ -1660,24 +1529,7 @@ def __init__(self, url: str, md_url: typing.Optional[str] = None, validate: bool self.no_metadata = set(md.keys()) == {'@context', 'url'} if "http://www.w3.org/ns/csvw" not in md.get('@context', ''): raise ValueError('Invalid or no @context') - if 'tables' in md: - if not md['tables'] or not isinstance(md['tables'], list): - raise ValueError('Invalid TableGroup with empty tables property') - if is_url(url): - self.t = TableGroup.from_url(url, data=md) - self.t.validate_schema(strict=True) - else: - self.t = TableGroup.from_file(url, data=md) - else: - if is_url(url): - self.t = Table.from_url(url, data=md) - if no_header: - if self.t.dialect: - 
self.t.dialect.header = False # pragma: no cover - else: - self.t.dialect = Dialect(header=False) - else: - self.t = Table.from_file(url, data=md) + self._set_tables(md, url, no_header) self.tables = self.t.tables if isinstance(self.t, TableGroup) else [self.t] for table in self.tables: for col in table.tableSchema.columns: @@ -1687,6 +1539,26 @@ def __init__(self, url: str, md_url: typing.Optional[str] = None, validate: bool if w: self.warnings.extend(w) + def _set_tables(self, md, url, no_header): + if 'tables' in md: + if not md['tables'] or not isinstance(md['tables'], list): + raise ValueError('Invalid TableGroup with empty tables property') + if is_url(url): + self.t = TableGroup.from_url(url, data=md) + self.t.validate_schema(strict=True) + else: + self.t = TableGroup.from_file(url, data=md) + else: + if is_url(url): + self.t = Table.from_url(url, data=md) + if no_header: + if self.t.dialect: + self.t.dialect.header = False # pragma: no cover + else: + self.t.dialect = Dialect(header=False) + else: + self.t = Table.from_file(url, data=md) + @property def is_valid(self) -> bool: """ @@ -1712,19 +1584,21 @@ def is_valid(self) -> bool: return not bool(self.warnings) @property - def tablegroup(self): + def tablegroup(self) -> TableGroup: + """The table spec.""" return self.t if isinstance(self.t, TableGroup) else \ TableGroup(at_props={'base': self.t.base}, tables=self.tables) @staticmethod - def locate_metadata(url=None) -> typing.Tuple[dict, bool]: + def locate_metadata(url=None) -> tuple[dict, bool]: """ Implements metadata discovery as specified in `§5. Locating Metadata `_ """ def describes(md, url): for table in md.get('tables', [md]): - # FIXME: We check whether the metadata describes a CSV file just superficially, + # FIXME: pylint: disable=W0511 + # We check whether the metadata describes a CSV file just superficially, # by comparing the last path components of the respective URLs. 
if url.split('/')[-1] == table['url'].split('/')[-1]: return True @@ -1734,24 +1608,24 @@ def describes(md, url): if url and is_url(url): # §5.2 Link Header # https://w3c.github.io/csvw/syntax/#link-header - res = requests.head(url) - no_header = bool(re.search(r'header\s*=\s*absent', res.headers.get('content-type', ''))) - desc = res.links.get('describedby') - if desc and desc['type'] in [ - "application/csvm+json", "application/ld+json", "application/json"]: - md = get_json(Link(desc['url']).resolve(url)) - if describes(md, url): - return md, no_header - else: - warnings.warn('Ignoring linked metadata because it does not reference the data') + content_type, links = utils.request_head(url) + no_header = bool(re.search(r'header\s*=\s*absent', content_type)) + for link in links: + if link.params.get('rel') == 'describedby': + if link.params.get('type') in [ + "application/csvm+json", "application/ld+json", "application/json"]: + md = utils.get_json(Link(link.url).resolve(url)) + if describes(md, url): + return md, no_header + warnings.warn('Ignoring linked metadata because it does not reference the data') # §5.3 Default Locations and Site-wide Location Configuration # https://w3c.github.io/csvw/syntax/ # #default-locations-and-site-wide-location-configuration - res = requests.get(Link('/.well-known/csvm').resolve(url)) + res = utils.request_get(Link('/.well-known/csvm').resolve(url)) locs = res.text if res.status_code == 200 else '{+url}-metadata.json\ncsv-metadata.json' for line in locs.split('\n'): - res = requests.get(Link(URITemplate(line).expand(url=url)).resolve(url)) + res = utils.request_get(Link(URITemplate(line).expand(url=url)).resolve(url)) if res.status_code == 200: try: md = res.json() @@ -1767,7 +1641,7 @@ def describes(md, url): elif url: # Default Locations for local files: if pathlib.Path(str(url) + '-metadata.json').exists(): - return get_json(pathlib.Path(str(url) + '-metadata.json')), no_header + return utils.get_json(pathlib.Path(str(url) + 
'-metadata.json')), no_header res = { '@context': "http://www.w3.org/ns/csvw", 'url': url, @@ -1799,7 +1673,7 @@ def to_json(self, minimal=False): def _table_to_json(self, table): res = collections.OrderedDict() - # FIXME: id + # FIXME: id pylint: disable=W0511 res['url'] = str(table.url.resolve(table.base)) if 'id' in table.at_props: res['@id'] = table.at_props['id'] @@ -1819,14 +1693,14 @@ def _table_to_json(self, table): for rownum, (_, rowsourcenum, row) in enumerate( table.iterdicts(with_metadata=True, strict=False), start=1) ] - if table._comments: - res['rdfs:comment'] = [c[1] for c in table._comments] + if table._comments: # pylint: disable=W0212 + res['rdfs:comment'] = [c[1] for c in table._comments] # pylint: disable=W0212 res['row'] = row return res - def _row_to_json(self, table, cols, row, rownum, rowsourcenum): + def _row_to_json(self, table, cols, row, rownum, rowsourcenum): # pylint: disable=R0913,R0917 res = collections.OrderedDict() - res['url'] = '{}#row={}'.format(table.url.resolve(table.base), rowsourcenum) + res['url'] = f'{table.url.resolve(table.base)}#row={rowsourcenum}' res['rownum'] = rownum if table.tableSchema.rowTitles: res['titles'] = [ @@ -1842,7 +1716,7 @@ def _row_to_json(self, table, cols, row, rownum, rowsourcenum): def _describes(self, table, cols, row, rownum): triples = [] - aboutUrl = table.tableSchema.inherit('aboutUrl') + aboutUrl = table.tableSchema.inherit('aboutUrl') # pylint: disable=invalid-name if aboutUrl: triples.append(jsonld.Triple( about=None, property='@id', value=table.expand(aboutUrl, row, _row=rownum))) @@ -1854,16 +1728,14 @@ def _describes(self, table, cols, row, rownum): # Skip null values: null = col.inherit_null() if col else table.inherit_null() - if (null and v in null) or v == "" or (v is None) or \ - (col and col.separator and v == []): + if any([null and v in null, v == "", v is None, col and col.separator and v == []]): continue triples.append(jsonld.Triple.from_col( table, col, row, - 
'_col.{}'.format(i) - if (not table.tableSchema.columns and not self.no_metadata) else k, + f'_col.{i}' if (not table.tableSchema.columns and not self.no_metadata) else k, v, rownum)) diff --git a/src/csvw/metadata_utils.py b/src/csvw/metadata_utils.py new file mode 100644 index 0000000..52fcc89 --- /dev/null +++ b/src/csvw/metadata_utils.py @@ -0,0 +1,472 @@ +""" +Helpers to model CSVW metadata as dataclasses. +""" +import re +import copy +import html +import json +import decimal +import warnings +import collections +from collections.abc import Generator +import dataclasses +from typing import Any, Optional, Union, TYPE_CHECKING + +from language_tags import tags + +from .utils import is_url, slug + +if TYPE_CHECKING: + from csvw.metadata import TableGroup # pragma: no cover + +__all__ = ['valid_common_property', 'valid_id_property', 'valid_context_property', + 'DescriptionBase', 'dataclass_asdict', 'NAMESPACES', 'dialect_props'] + +NumberType = Union[int, float, decimal.Decimal] +NAMESPACES = { + 'csvw': 'http://www.w3.org/ns/csvw#', + 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', + 'xsd': 'http://www.w3.org/2001/XMLSchema#', + 'dc': 'http://purl.org/dc/terms/', + 'dcat': 'http://www.w3.org/ns/dcat#', + 'prov': 'http://www.w3.org/ns/prov#', + 'schema': 'http://schema.org/', + "as": "https://www.w3.org/ns/activitystreams#", + "cc": "http://creativecommons.org/ns#", + "ctag": "http://commontag.org/ns#", + "dc11": "http://purl.org/dc/elements/1.1/", + "dctypes": "http://purl.org/dc/dcmitype/", + "dqv": "http://www.w3.org/ns/dqv#", + "duv": "https://www.w3.org/ns/duv#", + "foaf": "http://xmlns.com/foaf/0.1/", + "gr": "http://purl.org/goodrelations/v1#", + "grddl": "http://www.w3.org/2003/g/data-view#", + "ical": "http://www.w3.org/2002/12/cal/icaltzd#", + "jsonld": "http://www.w3.org/ns/json-ld#", + "ldp": "http://www.w3.org/ns/ldp#", + "ma": "http://www.w3.org/ns/ma-ont#", + "oa": "http://www.w3.org/ns/oa#", 
+ "odrl": "http://www.w3.org/ns/odrl/2/", + "og": "http://ogp.me/ns#", + "org": "http://www.w3.org/ns/org#", + "owl": "http://www.w3.org/2002/07/owl#", + "qb": "http://purl.org/linked-data/cube#", + "rdfa": "http://www.w3.org/ns/rdfa#", + "rev": "http://purl.org/stuff/rev#", + "rif": "http://www.w3.org/2007/rif#", + "rr": "http://www.w3.org/ns/r2rml#", + "sd": "http://www.w3.org/ns/sparql-service-description#", + "sioc": "http://rdfs.org/sioc/ns#", + "skos": "http://www.w3.org/2004/02/skos/core#", + "skosxl": "http://www.w3.org/2008/05/skos-xl#", + "sosa": "http://www.w3.org/ns/sosa/", + "ssn": "http://www.w3.org/ns/ssn/", + "time": "http://www.w3.org/2006/time#", + "v": "http://rdf.data-vocabulary.org/#", + "vcard": "http://www.w3.org/2006/vcard/ns#", + "void": "http://rdfs.org/ns/void#", + "wdr": "http://www.w3.org/2007/05/powder#", + "wrds": "http://www.w3.org/2007/05/powder-s#", + "xhv": "http://www.w3.org/1999/xhtml/vocab#", + "xml": "http://www.w3.org/XML/1998/namespace", +} +CSVW_TERMS = """Cell +Column +Datatype +Dialect +Direction +ForeignKey +JSON +NumericFormat +Row +Schema +Table +TableGroup +TableReference +Transformation +aboutUrl +base +columnReference +columns +commentPrefix +datatype +decimalChar +default +delimiter +describes +dialect +doubleQuote +encoding +foreignKeys +format +groupChar +header +headerRowCount +json +lang +length +lineTerminators +maxExclusive +maxInclusive +maxLength +maximum +minExclusive +minInclusive +minLength +minimum +name +notes +null +ordered +pattern +primaryKey +propertyUrl +quoteChar +reference +referencedRows +required +resource +row +rowTitles +rownum +schemaReference +scriptFormat +separator +skipBlankRows +skipColumns +skipInitialSpace +skipRows +source +suppressOutput +tableDirection +tableSchema +tables +targetFormat +textDirection +titles +transformations +trim +uriTemplate +url +valueUrl +virtual""".split() + + +def dataclass_asdict(obj, omit_defaults: bool = True, omit_private: bool = True) -> dict[str, 
Any]: + """Enhanced conversion of dataclass instances to a dict.""" + res = collections.OrderedDict() + for field in dataclasses.fields(obj): + default = field.default_factory() if callable(field.default_factory) else field.default + if not (omit_private and field.name.startswith('_')): + value = getattr(obj, field.name) + if not (omit_defaults and value == default): + if hasattr(value, 'asdict'): + value = value.asdict(omit_defaults=True) + res[field.name] = value + return res + + +def valid_id_property(v: str) -> Optional[str]: + """Validator for the @id property.""" + if not isinstance(v, str): + warnings.warn('Inconsistent link property') + return None + if v.startswith('_'): + raise ValueError(f'Invalid @id property: {v}') + return v + + +def valid_context_property(ctx: Union[None, str, list]) -> Union[None, str, list]: + """ + Make sure the requirements for @context objects in CSVW are met. + If not, warn or raise exceptions accordingly. + """ + nsurl = NAMESPACES['csvw'].replace('#', '') + if ctx is None: + return ctx + if isinstance(ctx, str): + assert ctx == nsurl + return ctx + assert isinstance(ctx, list), ctx + for obj in ctx: + if any((isinstance(obj, dict) and not set(obj.keys()).issubset({'@base', '@language'}), + isinstance(obj, str) and obj != nsurl)): + raise ValueError( + f'The @context MUST have one of the following values: An array composed of a ' + f'string followed by an object, where the string is {nsurl} and the ' + f'object represents a local context definition, which is restricted to contain ' + f'either or both of @base and @language.') + if isinstance(obj, dict) and '@language' in obj and not tags.check(obj['@language']): + warnings.warn('Invalid value for @language property') + del obj['@language'] + return ctx + + +def valid_common_property(v): # pylint: disable=too-many-branches + """Validator for values of common properties.""" + if not isinstance(v, (dict, list)): + # No JSON container types. We'll just assume all is good. 
+ return v + + if isinstance(v, list): # Recurse into the items. + return [valid_common_property(vv) for vv in v] + + if not {k[1:] for k in v if k.startswith('@')}.issubset({'id', 'language', 'type', 'value'}): + raise ValueError( + "Aside from @value, @type, @language, and @id, the properties used on an object " + "MUST NOT start with @.") + if '@value' in v: + if any(( + len(v) > 2, + set(v.keys()) not in [{'@value', '@language'}, {'@value', '@type'}], + not isinstance(v['@value'], (str, bool, int, float, decimal.Decimal)) + )): + raise ValueError( + "If a @value property is used on an object, that object MUST NOT have any other " + "properties aside from either @type or @language, and MUST NOT have both @type and " + "@language as properties. The value of the @value property MUST be a string, " + "number, or boolean value.") + if '@language' in v and '@value' not in v: + raise ValueError( + "A @language property MUST NOT be used on an object unless it also has a @value " + "property.") + if '@id' in v: + v['@id'] = valid_id_property(v['@id']) + if '@language' in v: + if not (isinstance(v['@language'], str) and tags.check(v['@language'])): + warnings.warn('Invalid language tag') + del v['@language'] + if '@type' in v: + vv = v['@type'] + if isinstance(vv, str): + if vv.startswith('_:'): + raise ValueError( + 'The value of any @id or @type contained within a metadata document ' + 'MUST NOT be a blank node.') + if not any(( + is_url(vv), + any(vv == ns or vv.startswith(ns + ':') for ns in NAMESPACES), + vv in CSVW_TERMS + )): + raise ValueError( + 'The value of any member of @type MUST be either a term defined in ' + '[csvw-context], a prefixed name where the prefix is a term defined in ' + '[csvw-context], or an absolute URL.') + elif not isinstance(vv, (list, dict)): + raise ValueError('Invalid datatype for @type') + return {k: valid_common_property(vv) for k, vv in v.items()} + + +@dataclasses.dataclass +class DescriptionBase: + """Container for + - common 
properties (see http://w3c.github.io/csvw/metadata/#common-properties) + - @-properties. + """ + common_props: dict[str, Any] = dataclasses.field(default_factory=dict) + at_props: dict[str, Any] = dataclasses.field(default_factory=dict) + + @classmethod + def partition_properties( + cls, + d: Union[dict, Any], + type_name: Optional[str] = None, + strict: bool = True + ) -> Union[dict, None]: + """ + Partitions properties in d into `common_props`, `at_props` and the remaining. + """ + if d and not isinstance(d, dict): + return None + fields = {f.name: f for f in dataclasses.fields(cls)} + type_name = type_name or cls.__name__ + c, a, dd = {}, {}, {} + for k, v in (d or {}).items(): + if k.startswith('@'): + if k == '@id': + v = valid_id_property(v) + if k == '@type' and v != type_name: + raise ValueError(f'Invalid @type property {v} for {type_name}') + a[k[1:]] = v + elif ':' in k: + c[k] = valid_common_property(v) + else: + if strict and (k not in fields): + warnings.warn(f'Invalid property {k} for {type_name}') + else: + dd[k] = v + return dict(common_props=c, at_props=a, **dd) # pylint: disable=R1735 + + @classmethod + def fromvalue(cls, d: dict): + """Initialize instance from dict.""" + return cls(**cls.partition_properties(d)) + + def _iter_dict_items(self, omit_defaults) -> Generator[tuple[str, Any], None, None]: + def _asdict_single(v): + return v.asdict(omit_defaults=omit_defaults) if hasattr(v, 'asdict') else v + + def _asdict_multiple(v): + if isinstance(v, (list, tuple)): + return [_asdict_single(vv) for vv in v] + return _asdict_single(v) + + for k, v in sorted(self.at_props.items()): + yield '@' + k, _asdict_multiple(v) + + for k, v in sorted(self.common_props.items()): + yield k, _asdict_multiple(v) + + for k, v in dataclass_asdict(self, omit_defaults=omit_defaults).items(): + if k not in ('common_props', 'at_props'): + yield k, _asdict_multiple(v) + + def asdict(self, omit_defaults=True) -> collections.OrderedDict[str, Any]: + """Serialization as 
dict.""" + # Note: The `null` property is the only inherited, list-valued property where the default + # is not the empty list. Thus, to allow setting it to empty, we must treat `null` as + # a special case here. + # See also https://www.w3.org/TR/tabular-metadata/#dfn-inherited-property + return collections.OrderedDict( + (k, v) for k, v in self._iter_dict_items(omit_defaults) + if (k == 'null' or (v not in ([], {})))) + + +def dialect_props(d: dict[str, Any]) -> dict: + """Slightly massage a dialect specification into something accepted by our Dialect class.""" + if not isinstance(d, dict): + warnings.warn('Invalid dialect spec') + return {} + partitioned = DescriptionBase.partition_properties(d, type_name='Dialect', strict=False) + del partitioned['at_props'] + del partitioned['common_props'] + if partitioned.get('headerRowCount'): + partitioned['header'] = True + return partitioned + + +def qname2url(qname: str) -> Optional[str]: + """Turn a qname into an http URL by replacing the prefix with the associated URL.""" + for prefix, uri in NAMESPACES.items(): + if qname.startswith(prefix + ':'): + return qname.replace(prefix + ':', uri) + return None + + +def metadata2markdown(tg: 'TableGroup', link_files: bool = False) -> str: + """ + Render the metadata of a dataset as markdown. + + :param link_files: If True, links to data files will be added, assuming the markdown is stored \ + in the same directory as the metadata file. 
+ :return: `str` with markdown formatted text + """ + fname = tg._fname # pylint: disable=W0212 + res = [f"# {tg.common_props.get('dc:title', 'Dataset')}\n"] + if fname and link_files: + res.append(f'> [!NOTE]\n> Described by [{fname.name}]({fname.name}).\n') + + res.append(_properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'})) + + for table in tg.tables: + res.extend(list(_iter_table2markdown(tg, table, link_files))) + return '\n'.join(res) + + +def _qname2link(qname, html=False): # pylint: disable=W0621 + url = qname2url(qname) + if url: + if html: + return f'{qname}' + return f'[{qname}]({url})' + return qname + + +def _htmlify(obj, key=None): + """ + For inclusion in tables we must use HTML for lists. + """ + if isinstance(obj, list): + lis = ''.join(f'
  • {_htmlify(item, key=key)}
  • ' for item in obj) + return f'
      {lis}
    ' + if isinstance(obj, dict): + items = [] + for k, v in obj.items(): + items.append(f'
    {_qname2link(k, html=True)}
    {html.escape(str(v))}
    ') + return f"
    {''.join(items)}
    " + return str(obj) + + +def _properties(props): + def _img(img: Union[str, dict]): + if isinstance(img, str): # pragma: no cover + img = {'https://schema.org/contentUrl': img} + return (f"![{img.get('https://schema.org/caption') or ''}]" + f"({img.get('https://schema.org/contentUrl')})\n") + + props = {k: v for k, v in copy.deepcopy(props).items() if v} + res = [] + desc = props.pop('dc:description', None) + if desc: + res.append(desc + '\n') + img = props.pop('https://schema.org/image', None) + if img: + res.append(_img(img)) + if props: + res.append('property | value\n --- | ---') + for k, v in props.items(): + res.append(f'{_qname2link(k)} | {_htmlify(v, key=k)}') + return '\n'.join(res) + '\n' + + +def _iter_table2markdown(tg, table, link_files): + fks = { + fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) + for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} + header = f'## Table ' + fname = tg._fname # pylint: disable=W0212 + if (link_files and fname and fname.parent.joinpath(table.url.string).exists()): + header += f'[{table.url.string}]({table.url.string})\n' + else: # pragma: no cover + header += table.url.string + yield '\n' + header + '\n' + yield _properties(table.common_props) + dialect = table.inherit('dialect') + if dialect.asdict(): + yield f'\n**CSV dialect**: `{json.dumps(dialect.asdict())}`\n' + yield '\n### Columns\n' + yield 'Name/Property | Datatype | Description' + yield ' --- | --- | --- ' + for col in table.tableSchema.columns: + yield _colrow(col, fks, table.tableSchema.primaryKey) + + +def _colrow(col, fks, pk): + dt = f"`{col.datatype.base if col.datatype else 'string'}`" + if col.datatype: + if col.datatype.format: + if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): + dt += '
    Valid choices:
    ' + dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|')) + elif col.datatype.base == 'string': + dt += f'
    Regex: `{col.datatype.format}`' + if col.datatype.minimum: + dt += f'
    ≥ {col.datatype.minimum}' + if col.datatype.maximum: + dt += f'
    ≤ {col.datatype.maximum}' + if col.separator: + dt = f'list of {dt} (separated by `{col.separator}`)' + desc = col.common_props.get('dc:description', '').replace('\n', ' ') + + if pk and col.name in pk: + desc = (desc + '
    ') if desc else desc + desc += 'Primary key' + + if col.name in fks: + desc = (desc + '
    ') if desc else desc + cname, tname = fks[col.name] + desc += f'References [{tname}::{cname}](#table-{slug(tname)})' + + return ' | '.join([ + f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) diff --git a/src/csvw/utils.py b/src/csvw/utils.py index affbfdf..bd13f00 100644 --- a/src/csvw/utils.py +++ b/src/csvw/utils.py @@ -1,61 +1,185 @@ +""" +Misc +""" +import io import re -import copy -import html import json import string import keyword -import pathlib +import logging import warnings +import contextlib import collections +import dataclasses import unicodedata +import urllib.request +from typing import Callable, Any, Union, Optional, Literal -import attr +HTTP_REQUEST_TIMEOUT = 10 -def is_url(s): - return re.match(r'https?://', str(s)) +@dataclasses.dataclass +class LinkHeader: + """ + https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Link + """ + url: str + params: dict[str, str] + @classmethod + def from_string(cls, s): + """ + ; param1=value1; param2="value2" + """ + comps = re.split(r'>\s*;\s*', s.strip(), maxsplit=1) + if len(comps) == 2: + url, sparams = comps + url += '>' + else: + url, sparams = comps[0], '' + assert url.startswith('<') and url.endswith('>') + url = url[1:-1].strip() + params = {} + if sparams: + for sparam in sparams.split(';'): + key, _, value = sparam.strip().partition('=') + key, value = key.strip(), (value or '').strip() + if value.startswith('"'): + assert value.endswith('"') + value = value[1:-1].strip() + params[key] = value or None + return cls(url=url, params=params) + + @classmethod + def iter_links(cls, s): + """ + A Link header might contain multiple links separated by comma. 
+ """ + for i, single in enumerate(re.split(r',\s*<', s)): + yield cls.from_string(single if i == 0 else '<' + single) -def converter(type_, default, s, allow_none=False, cond=None, allow_list=True): - if allow_list and type_ != list and isinstance(s, list): - return [v for v in [converter(type_, None, ss, cond=cond) for ss in s] if v is not None] - if allow_none and s is None: - return s - if not isinstance(s, type_) or (type_ == int and isinstance(s, bool)) or (cond and not cond(s)): - warnings.warn('Invalid value for property: {}'.format(s)) - return default - return s +@contextlib.contextmanager +def urlopen( + url, + method: Optional[Literal['HEAD', 'GET']] = 'GET', + timeout=HTTP_REQUEST_TIMEOUT, +): + """ + Open URLs + - without raising an exception on HTTP errors, + - passing a specific User-Agent header, + - specifying a timeout. + """ + class NonRaisingHTTPErrorProcessor(urllib.request.HTTPErrorProcessor): + """Don't raise exceptions on HTTP errors.""" + http_response = https_response = lambda self, req, res: res # pylint: disable=C3001 + + opener = urllib.request.build_opener(NonRaisingHTTPErrorProcessor) + opener.addheaders = [('User-agent', 'csvw/4.0.0')] + yield opener.open(urllib.request.Request(url, method=method), timeout=timeout) + + +def request_head(url) -> tuple[str, list[LinkHeader]]: + """Makes a HEAD request and returns the relevant response data.""" + with urlopen(url) as response: + links = [] + for mult in response.info().get_all('Link') or []: + links.extend(LinkHeader.iter_links(mult)) + return response.info().get_content_type() or '', links + + +@dataclasses.dataclass +class GetResponse: + """Relevant data from an HTTP GET response.""" + status_code: int = 200 + content: bytes = None + text: str = None + + def __post_init__(self): + if self.content and not self.text: + self.text = self.content.decode('utf8') + if self.text and not self.content: + self.content = self.text.encode('utf8') + + @classmethod + def from_response(cls, 
response) -> 'GetResponse': + """Initialize instance with data from a urllib response.""" + content = response.read() + text = content.decode(response.headers.get_content_charset() or 'utf-8') + return cls(status_code=response.status, content=content, text=text) + + def json(self) -> Any: + """The content of the repsonse parsed as JSON.""" + return json.loads(self.text, object_pairs_hook=collections.OrderedDict) + + +def request_get(url: str) -> GetResponse: + """Makes a GET request.""" + with urlopen(url) as response: + return GetResponse.from_response(response) + + +def log_or_raise( + msg: str, + log: Optional[logging.Logger] = None, + level: str = 'warning', + exception_cls: type = ValueError): + """ + Helper for error handling. In an inspection scenario, we want to list - i.e. log - all + errors. In a validation scenario, we raise an exception at the first error. + """ + if log: + getattr(log, level)(msg) + else: + raise exception_cls(msg) -def ensure_path(fname): - if not isinstance(fname, pathlib.Path): - assert isinstance(fname, str) - return pathlib.Path(fname) - return fname +def json_open(filename, mode='r', encoding='utf-8'): + """Open a text file suitable for reading JSON content, i.e. 
assuming it is utf-8 encoded.""" + assert encoding == 'utf-8' + return io.open(filename, mode, encoding=encoding) -def attr_defaults(cls): - res = collections.OrderedDict() - for field in attr.fields(cls): - default = field.default - if isinstance(default, attr.Factory): - default = default.factory() - res[field.name] = default - return res +def get_json(fname) -> Union[list, dict]: + """Retrieve JSON content from a local file or remote URL.""" + fname = str(fname) + if is_url(fname): + return request_get(fname).json() + with json_open(fname) as f: + return json.load(f, object_pairs_hook=collections.OrderedDict) -def attr_asdict(obj, omit_defaults=True, omit_private=True): - defs = attr_defaults(obj.__class__) - res = collections.OrderedDict() - for field in attr.fields(obj.__class__): - if not (omit_private and field.name.startswith('_')): - value = getattr(obj, field.name) - if not (omit_defaults and value == defs[field.name]): - if hasattr(value, 'asdict'): - value = value.asdict(omit_defaults=True) - res[field.name] = value - return res +def optcast(type_: type) -> Callable[[Any], Any]: + """Returns a callable that casts its argument to type_ unless it is None.""" + return lambda v: v if v is None else type_(v) + + +def is_url(s): # pylint: disable=C0116 + return re.match(r'https?://', str(s)) + + +def type_checker( # pylint: disable=R0913,R0917 + type_: type, + default: Optional[Any], + v: Union[list[Any], Any], + allow_none: bool = False, + cond: Optional[Callable[[Any], bool]] = None, + allow_list=True, +) -> Any: + """Check if a value has a certain type (with bells and whistles), warn if not.""" + if allow_list and type_ != list and isinstance(v, list): + # Convert a list of strings by applying the conversion to each not-None item. + return [v for v in [type_checker(type_, None, vv, cond=cond) for vv in v] if v is not None] + + if allow_none and v is None: + return v + + # Note: `bool` is a `subclass` of int in Python! 
+ if not isinstance(v, type_) or (type_ == int and isinstance(v, bool)) or (cond and not cond(v)): + warnings.warn(f'Invalid value for property: {v}') + return default + return v def normalize_name(s): @@ -103,128 +227,3 @@ def slug(s, remove_whitespace=True, lowercase=True): res = res.encode('ascii', 'ignore').decode('ascii') assert re.match('[ A-Za-z0-9]*$', res) return res - - -def qname2url(qname): - for prefix, uri in { - 'csvw': 'http://www.w3.org/ns/csvw#', - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', - 'xsd': 'http://www.w3.org/2001/XMLSchema#', - 'dc': 'http://purl.org/dc/terms/', - 'dcat': 'http://www.w3.org/ns/dcat#', - 'prov': 'http://www.w3.org/ns/prov#', - }.items(): - if qname.startswith(prefix + ':'): - return qname.replace(prefix + ':', uri) - - -def metadata2markdown(tg, link_files=False) -> str: - """ - Render the metadata of a dataset as markdown. - - :param link_files: If True, links to data files will be added, assuming the markdown is stored \ - in the same directory as the metadata file. - :return: `str` with markdown formatted text - """ - def qname2link(qname, html=False): - url = qname2url(qname) - if url: - if html: - return '{}'.format(url, qname) - return '[{}]({})'.format(qname, url) - return qname - - def htmlify(obj, key=None): - """ - For inclusion in tables we must use HTML for lists. - """ - if isinstance(obj, list): - return '
      {}
    '.format( - ''.join('
  • {}
  • '.format(htmlify(item, key=key)) for item in obj)) - if isinstance(obj, dict): - items = [] - for k, v in obj.items(): - items.append('
    {}
    {}
    '.format( - qname2link(k, html=True), html.escape(str(v)))) - return '
    {}
    '.format(''.join(items)) - return str(obj) - - def properties(props): - props = {k: v for k, v in copy.deepcopy(props).items() if v} - res = [] - desc = props.pop('dc:description', None) - if desc: - res.append(desc + '\n') - img = props.pop('https://schema.org/image', None) - if img: - if isinstance(img, str): # pragma: no cover - img = {'contentUrl': img} - res.append('![{}]({})\n'.format( - img.get('https://schema.org/caption') or '', - img.get('https://schema.org/contentUrl'))) - if props: - res.append('property | value\n --- | ---') - for k, v in props.items(): - res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k))) - return '\n'.join(res) + '\n' - - def colrow(col, fks, pk): - dt = '`{}`'.format(col.datatype.base if col.datatype else 'string') - if col.datatype: - if col.datatype.format: - if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): - dt += '
    Valid choices:
    ' - dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|')) - elif col.datatype.base == 'string': - dt += '
    Regex: `{}`'.format(col.datatype.format) - if col.datatype.minimum: - dt += '
    ≥ {}'.format(col.datatype.minimum) - if col.datatype.maximum: - dt += '
    ≤ {}'.format(col.datatype.maximum) - if col.separator: - dt = 'list of {} (separated by `{}`)'.format(dt, col.separator) - desc = col.common_props.get('dc:description', '').replace('\n', ' ') - - if pk and col.name in pk: - desc = (desc + '
    ') if desc else desc - desc += 'Primary key' - - if col.name in fks: - desc = (desc + '
    ') if desc else desc - desc += 'References [{}::{}](#table-{})'.format( - fks[col.name][1], fks[col.name][0], slug(fks[col.name][1])) - - return ' | '.join([ - '[{}]({})'.format(col.name, col.propertyUrl) - if col.propertyUrl else '`{}`'.format(col.name), - dt, - desc, - ]) - - res = ['# {}\n'.format(tg.common_props.get('dc:title', 'Dataset'))] - if tg._fname and link_files: - res.append('> [!NOTE]\n> Described by [{0}]({0}).\n'.format(tg._fname.name)) - - res.append(properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'})) - - for table in tg.tables: - fks = { - fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) - for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} - header = '## Table '.format(slug(table.url.string)) - if link_files and tg._fname and tg._fname.parent.joinpath(table.url.string).exists(): - header += '[{0}]({0})\n'.format(table.url.string) - else: # pragma: no cover - header += table.url.string - res.append('\n' + header + '\n') - res.append(properties(table.common_props)) - dialect = table.inherit('dialect') - if dialect.asdict(): - res.append('\n**CSV dialect**: `{}`\n'.format(json.dumps(dialect.asdict()))) - res.append('\n### Columns\n') - res.append('Name/Property | Datatype | Description') - res.append(' --- | --- | --- ') - for col in table.tableSchema.columns: - res.append(colrow(col, fks, table.tableSchema.primaryKey)) - return '\n'.join(res) diff --git a/tests/conftest.py b/tests/conftest.py index 9991037..52e931b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,28 +2,37 @@ import pathlib import warnings import contextlib +import dataclasses import urllib.parse import urllib.request +from typing import Optional, Literal import pytest -import attr -from csvw.metadata import CSVW, get_json +from csvw.metadata import CSVW +from csvw.utils import get_json, LinkHeader, GetResponse def pytest_addoption(parser): parser.addoption("--number", type=int, 
help="csvw json test number", default=None) -def csvw_tests_url(path): - return 'http://www.w3.org/2013/csvw/tests/{}'.format(path) +def csvw_tests_url(path) -> str: + return f'http://www.w3.org/2013/csvw/tests/{path}' -def csvw_tests_path(path): +def csvw_tests_path(path) -> pathlib.Path: + """ + We have cloned the csvw test suite locally, to be able to run tests without network. + """ return pathlib.Path(__file__).parent / 'fixtures' / 'csvw' / 'tests' / path def unorder(o): + """ + To make assertions about equality of container instances work, we turn ordered dicts into + regular ones. + """ if isinstance(o, dict): return {k: unorder(v) for k, v in o.items()} if isinstance(o, list): @@ -31,27 +40,53 @@ def unorder(o): return o -@attr.s +@dataclasses.dataclass class CSWVTest: - id = attr.ib(converter=lambda s: s.split('#')[-1]) - type = attr.ib(validator=attr.validators.in_([ + """ + Python object capturing the information of a CSVW test manifest looking like + { + "id": "manifest-json#test001", + "type": "csvt:ToJsonTest", + "name": "Simple table", + "comment": "The simplest possible table without metadata", + "approval": "rdft:Approved", + "option": { + "noProv": true + }, + "action": "test001.csv", + "result": "test001.json" + } + """ + id: str + type: Literal[ 'csvt:NegativeJsonTest', 'csvt:ToJsonTest', 'csvt:ToJsonTestWithWarnings', 'csvt:PositiveValidationTest', 'csvt:NegativeValidationTest', - 'csvt:WarningValidationTest', - ])) - name = attr.ib() - comment = attr.ib() - approval = attr.ib() - option = attr.ib( - converter=lambda d: {k: csvw_tests_url(v) if k == 'metadata' else v for k, v in d.items()}) - action = attr.ib(converter=lambda s: csvw_tests_url(s)) - result = attr.ib(converter=lambda s: csvw_tests_url(s) if s else None, default=None) - implicit = attr.ib(default=None) - httpLink = attr.ib(default=None) - contentType = attr.ib(default=None) + 'csvt:WarningValidationTest'] + name: str + comment: str + approval: str + option: dict + action: str + 
result: Optional[str] = None + implicit: str = None + httpLink: str = None + contentType: str = None + + def __post_init__(self): + self.id = self.id.split('#')[-1] + self.option = { + k: csvw_tests_url(v) if k == 'metadata' else v for k, v in self.option.items()} + self.action = csvw_tests_url(self.action) + self.result = csvw_tests_url(self.result) if self.result else None + + @property + def csvw_instance(self) -> CSVW: + return CSVW(self.action, + md_url=self.option.get('metadata'), + validate=self.is_validation_test) @property def is_json_test(self): @@ -65,7 +100,43 @@ def is_validation_test(self): def number(self): # pragma: no cover return int(self.id.replace('test', '')) - def _run(self): + def request_head(self, _): + """ + Used to patch `utils.request_head` in order to run tests without actual HTTP requests. + """ + if self.contentType: + return self.contentType, [] + if self.httpLink: + return '', [LinkHeader.from_string(self.httpLink)] + return '', [] + + @staticmethod + def request_get(url): + """ + Used to patch `utils.request_get` in order to run tests without actual HTTP requests. 
+ """ + url = urllib.parse.urlparse(url) + if url.netloc == 'www.w3.org': + if url.path.startswith('/2013/csvw/tests/'): + p = csvw_tests_path(url.path.replace('/2013/csvw/tests/', '')) + if p.exists(): + return GetResponse(content=p.read_bytes()) + elif url.path == '/.well-known/csvm': + return GetResponse(text="""{+url}-metadata.json +csv-metadata.json +{+url}.json +csvm.json +""") + return GetResponse(status_code=404) + raise ValueError(url) # pragma: no cover + + def run(self, mocker): + # Mock HTTP requests: + mocker.patch('csvw.metadata.utils.request_head', self.request_head) + mocker.patch('csvw.metadata.utils.request_get', self.request_get) + mocker.patch('csvw.utils.request_get', self.request_get) + + # Prepare the context for running the test: with contextlib.ExitStack() as stack: if self.type == "csvt:ToJsonTestWithWarnings": stack.enter_context(pytest.warns(UserWarning)) @@ -89,8 +160,8 @@ def _run(self): elif self.type == "csvt:WarningValidationTest": stack.enter_context(pytest.warns(UserWarning)) - ds = CSVW( - self.action, md_url=self.option.get('metadata'), validate=self.is_validation_test) + ds = self.csvw_instance + if self.is_validation_test: if self.type == 'csvt:PositiveValidationTest': assert ds.is_valid @@ -103,57 +174,32 @@ def _run(self): elif self.is_json_test: assert unorder(ds.to_json(minimal=self.option.get('minimal'))) == \ - unorder(get_json(self.result)), \ - '{}: {}'.format(self.id, self.name) - - def run(self): - import requests_mock - - def text_callback(request, context): - url = urllib.parse.urlparse(request.url) - if url.netloc == 'www.w3.org': - if url.path.startswith('/2013/csvw/tests/'): - p = csvw_tests_path(url.path.replace('/2013/csvw/tests/', '')) - if p.exists(): - context.status_code = 200 - return p.read_text(encoding='utf8') - elif url.path == '/.well-known/csvm': - context.status_code = 200 - return """{+url}-metadata.json -csv-metadata.json -{+url}.json -csvm.json -""" - context.status_code = 404 - return '' - 
raise ValueError(request.url) # pragma: no cover - - with requests_mock.Mocker() as mock: - if self.contentType: - mock.head(self.action, text='', headers={'Content-Type': self.contentType}) - elif self.httpLink: - mock.head(self.action, text='', headers={'Link': self.httpLink}) - else: - mock.head(self.action, text='', headers={}) - mock.get(requests_mock.ANY, text=text_callback) - self._run() + unorder(get_json(self.result)), f'{self.id}: {self.name}' def pytest_generate_tests(metafunc): + def iter_tests(manifest, cond, xfail): + for t in json.loads(csvw_tests_path(manifest).read_text(encoding='utf8'))['entries']: + test = CSWVTest(**t) + if cond(test): + if test.number in xfail: + yield pytest.param(test, marks=pytest.mark.xfail) + else: + yield test + + # We xfail some tests, which test ambiguous parts of the spec, or require behaviour which seems + # overly complex to implement. if "csvwjsontest" in metafunc.fixturenames: + number = metafunc.config.getoption("number") + testname = "csvwjsontest" xfail = { 193: "Why do we have to format durations with particular comps, e.g. PT130M and not " "PT2H10M?", } - number = metafunc.config.getoption("number") - tests = json.loads(csvw_tests_path('manifest-json.jsonld').read_text(encoding='utf8')) - metafunc.parametrize( - "csvwjsontest", - [pytest.param(test, marks=pytest.mark.xfail) if test.number in xfail else test - for test in [CSWVTest(**t) for t in tests['entries']] - if number is None or number == test.number]) - - if "csvwnonnormtest" in metafunc.fixturenames: + manifest = 'manifest-json.jsonld' + condition = lambda t: number is None or number == t.number + elif "csvwnonnormtest" in metafunc.fixturenames: + testname = "csvwnonnormtest" xfail = { 20: "Don't understand the test.", 21: "Don't understand the test. 
If not trimming makes reading the data impossible, " @@ -165,22 +211,19 @@ def pytest_generate_tests(metafunc): 58: "Again, the trimming seems to not be expected?", 59: "Again, the trimming seems to not be expected?", } - tests = json.loads(csvw_tests_path('manifest-nonnorm.jsonld').read_text(encoding='utf8')) - metafunc.parametrize( - "csvwnonnormtest", - [pytest.param(test, marks=pytest.mark.xfail) if test.number in xfail else test - for test in [CSWVTest(**t) for t in tests['entries']] if 'Json' in test.type]) - - if "csvwvalidationtest" in metafunc.fixturenames: + manifest = 'manifest-nonnorm.jsonld' + condition = lambda t: 'Json' in t.type + elif "csvwvalidationtest" in metafunc.fixturenames: + testname = "csvwvalidationtest" xfail = { 92: "Can't detect malformed JSON if we don't know whether we are fed a metadata or a " "CSV file to begin with!", 124: "Hm. Didn't we have this as ToJson test with warnings?", } + manifest = 'manifest-validation.jsonld' number = metafunc.config.getoption("number") - tests = json.loads(csvw_tests_path('manifest-validation.jsonld').read_text(encoding='utf8')) - metafunc.parametrize( - "csvwvalidationtest", - [pytest.param(test, marks=pytest.mark.xfail) if test.number in xfail else test - for test in [CSWVTest(**t) for t in tests['entries']] - if number is None or number == test.number]) + condition = lambda t: number is None or number == t.number + else: + return + + metafunc.parametrize(testname, list(iter_tests(manifest, condition, xfail))) diff --git a/tests/test_cli.py b/tests/test_cli.py index d4ded1b..bb1eecb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,12 +1,13 @@ import os import json -import shutil import pathlib +import sqlite3 import argparse import pytest -from csvw.__main__ import csvw2json, csvw2datasette, csvwdescribe, csvwvalidate, csvw2markdown +from csvw.__main__ import ( + csvw2json, csvw2datasette, csvwdescribe, csvwvalidate, csvw2markdown, csvw2sqlite) def relpath(fname): @@ -82,3 +83,14 @@ def 
test_csvwdescribe(csvname, tsvname, capsys): def test_csvw2datasette(tmp_path, mdname): run(csvw2datasette, url=mdname, outdir=tmp_path) assert tmp_path.joinpath('datasette.db').exists() + + +def test_csvw2sqlite(tmp_path, mdname): + out = tmp_path / 'db.sqlite' + run(csvw2sqlite, url=mdname, output=out) + assert out.exists() + conn = sqlite3.connect(out) + cu = conn.cursor() + cu.execute('select count(*) from `csv.txt`') + assert cu.fetchone()[0] == 2 + conn.close() diff --git a/tests/test_conformance.py b/tests/test_conformance.py index b72833b..e96231f 100644 --- a/tests/test_conformance.py +++ b/tests/test_conformance.py @@ -6,18 +6,18 @@ @pytest.mark.conformance -def test_csvw_json(csvwjsontest): - csvwjsontest.run() +def test_csvw_json(csvwjsontest, mocker): + csvwjsontest.run(mocker) @pytest.mark.conformance -def test_csvw_nonnorm(csvwnonnormtest): - csvwnonnormtest.run() +def test_csvw_nonnorm(csvwnonnormtest, mocker): + csvwnonnormtest.run(mocker) @pytest.mark.conformance -def test_csvw_validation(csvwvalidationtest): - csvwvalidationtest.run() +def test_csvw_validation(csvwvalidationtest, mocker): + csvwvalidationtest.run(mocker) def test_prefix_in_property_url(): diff --git a/tests/test_datatypes.py b/tests/test_datatypes.py index c1fa6bd..0becfb2 100644 --- a/tests/test_datatypes.py +++ b/tests/test_datatypes.py @@ -81,9 +81,28 @@ def test_roundtrip(datatype, val, obj, roundtrip): assert t.formatted(o) == val +@pytest.mark.parametrize( + 'spec', + [ + {'base': 'string', 'length': 5, 'minLength': 6}, + {'base': 'string', 'length': 5, 'maxLength': 4}, + {'base': 'string', 'maxLength': 5, 'minLength': 6}, + 5, + {'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.SGS'}, + {'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.S XxX'}, + {'base': 'dateTimeStamp', 'format': 'd.M.yyyy HH:mm:ss.SSS'}, + ] +) +def test_invalid_spec(spec): + with pytest.raises(ValueError): + Datatype.fromvalue(spec) + + @pytest.mark.parametrize( 'datatype,val', [ + ({'base': 'string', 
'maxLength': 4}, 'abcdefg'), + ({'base': 'string', 'minLength': 4}, 'abc'), ({'base': 'nonNegativeInteger'}, '-1'), ({'base': 'positiveInteger'}, '0'), ({'base': 'double', 'minimum': 10}, '3.1'), @@ -100,7 +119,7 @@ def test_roundtrip(datatype, val, obj, roundtrip): ({'base': 'hexBinary'}, 'spam'), ] ) -def test_invalid(datatype, val): +def test_invalid_value(datatype, val): t = Datatype.fromvalue(datatype) with pytest.raises(ValueError): t.read(val) @@ -133,7 +152,7 @@ def test_number(): assert t.formatted(v) == '3' with pytest.raises(ValueError): t.validate(12) - + t = Datatype.fromvalue( {'base': 'decimal', 'format': {'groupChar': '.', 'decimalChar': ','}}) with warnings.catch_warnings(): @@ -152,38 +171,10 @@ def test_number(): assert t.formatted(decimal.Decimal('-3.1415')) == '3,14-' -def test_errors(): - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'string', 'length': 5, 'minLength': 6}) - - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'string', 'length': 5, 'maxLength': 4}) - - with pytest.raises(ValueError): - dt = Datatype.fromvalue({'base': 'string', 'minLength': 4}) - dt.validate('abc') - - with pytest.raises(ValueError): - dt = Datatype.fromvalue({'base': 'string', 'maxLength': 4}) - dt.validate('abcdefg') - - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'string', 'maxLength': 5, 'minLength': 6}) - - with pytest.raises(ValueError): - Datatype.fromvalue(5) - - def test_date(): with pytest.warns(UserWarning): Datatype.fromvalue({'base': 'date', 'format': '2012+12+12'}) - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.SGS'}) - - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.S XxX'}) - t = Datatype.fromvalue({'base': 'datetime', 'format': 'd.M.yyyy HH:mm:ss.SSS'}) assert t.formatted(datetime.datetime(2012, 12, 12, 12, 12, 12, microsecond=12345)) == \ '12.12.2012 12:12:12.012' @@ -203,9 +194,6 @@ 
def test_date(): assert t.formatted(t.parse('2012-12-01T12:12:12.123456+05:30')) == \ '2012-12-01T12:12:12.123456+05:30' - with pytest.raises(ValueError): - Datatype.fromvalue({'base': 'dateTimeStamp', 'format': 'd.M.yyyy HH:mm:ss.SSS'}) - t = Datatype.fromvalue({'base': 'duration', 'format': 'P[1-5]Y'}) with pytest.raises(ValueError): t.parse('P8Y') diff --git a/tests/test_db.py b/tests/test_db.py index d134741..bf5cb98 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -15,14 +15,7 @@ @pytest.fixture def tg(): - return TableGroup.fromvalue({'tables': [ - { - 'url': 'data', - 'tableSchema': { - 'columns': [] - } - } - ]}) + return TableGroup.fromvalue({'tables': [{'url': 'data', 'tableSchema': {'columns': []}}]}) @pytest.fixture @@ -244,6 +237,25 @@ def test_many_to_many(tg_with_foreign_keys): db._connection.close() +def test_many_to_many_2(tg_with_foreign_keys): + db = Database(tg_with_foreign_keys) + db.write( + ref=[ + {'pk': '1', 'ref1': ['y', 'x']}, + {'pk': '2', 'ref1': ['x']}, + ], + data=[{'v': 'x'}, {'v': 'y'}]) + + res = db.read()['ref'][0] + # Associations between the same pair of tables are grouped by foreign key column: + assert res['ref1'] == ['y', 'x'] + assert res['ref2'] == [] + res = db.read()['ref'][1] + # Associations between the same pair of tables are grouped by foreign key column: + assert res['ref1'] == ['x'] + db._connection.close() + + def test_many_to_many_no_context(tg_with_foreign_keys): class DatabaseWithoutContext(Database): def association_table_context(self, table, column, fkey): diff --git a/tests/test_frictionless.py b/tests/test_frictionless.py index 0d1a8ea..75a8af2 100644 --- a/tests/test_frictionless.py +++ b/tests/test_frictionless.py @@ -43,10 +43,10 @@ def test_DataPackage_init(): warnings.simplefilter('ignore') dp = DataPackage(dict(resources=[], name='x')) dp = DataPackage(dp) - assert dp.to_tablegroup().common_props['dc:identifier'] == 'x' + assert dp.to_tablegroup(TableGroup).common_props['dc:identifier'] == 'x' 
dp = DataPackage('{"resources": [], "name": "x", "id": "y"}') - assert dp.to_tablegroup().common_props['dc:identifier'] == 'y' - assert dp.to_tablegroup().common_props['dc:title'] == 'x' + assert dp.to_tablegroup(TableGroup).common_props['dc:identifier'] == 'y' + assert dp.to_tablegroup(TableGroup).common_props['dc:title'] == 'x' def test_DataPackage_constraints(datafactory): @@ -54,23 +54,23 @@ def test_DataPackage_constraints(datafactory): warnings.simplefilter('ignore') dp = datafactory([{'name': 'col', 'constraints': {'maxLength': 3}}], [['abcd']]) with pytest.raises(ValueError): - _ = list(DataPackage(dp).to_tablegroup().tables[0]) + _ = list(DataPackage(dp).to_tablegroup(TableGroup).tables[0]) dp = datafactory([{'name': 'col', 'constraints': {'pattern': '[a-z]{2}'}}], [['abcd']]) with pytest.raises(ValueError): - _ = list(DataPackage(dp).to_tablegroup().tables[0]) + _ = list(DataPackage(dp).to_tablegroup(TableGroup).tables[0]) dp = datafactory( [{'name': 'col', 'type': 'year', 'constraints': {'pattern': '[2].*'}}], [['1990']]) with pytest.raises(ValueError): - _ = list(DataPackage(dp).to_tablegroup().tables[0]) + _ = list(DataPackage(dp).to_tablegroup(TableGroup).tables[0]) def test_DataPackage(tmpfixtures): with warnings.catch_warnings(): warnings.simplefilter('ignore') dp = DataPackage(tmpfixtures / 'datapackage.json') - tg = dp.to_tablegroup() + tg = dp.to_tablegroup(TableGroup) rows = list(tg.tables[0]) assert len(rows) == 9 assert rows[-1]['Year'] == 2012 diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 292e444..5faad2d 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -8,12 +8,13 @@ import warnings import collections -from csvw.metadata import json_open +from csvw.utils import json_open import pytest import csvw from csvw.dsv import Dialect +from csvw.utils import GetResponse FIXTURES = pathlib.Path(__file__).parent / 'fixtures' @@ -41,7 +42,7 @@ def test_Link(link, base, res): assert csvw.Link(link).resolve(base) == 
res -class TestColumnEquality(object): +class TestColumnEquality: def test_get_column(self): t1 = csvw.Table.fromvalue({ @@ -57,7 +58,7 @@ def test_get_column(self): assert t1.tableSchema.columns[0] == t2.tableSchema.columns[0] -class TestColumnAccess(object): +class TestColumnAccess: def test_get_column(self): t = csvw.Table.fromvalue({ @@ -76,7 +77,7 @@ def test_get_column(self): assert t.get_column('xyz').name is None -class TestDialect(object): +class TestDialect: @staticmethod def _roundtrip(t, fpath, *items): @@ -104,19 +105,19 @@ def test_doubleQuote(self, tmp_path): t.dialect.doubleQuote = False c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) assert r'\"a\\\\b\\c\\\"d' in c - assert res[0]['col2'] == value + assert res[0]['col2'] == value # pragma: no cover - t.dialect.quoteChar = '*' - c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) - assert res[0]['col2'] == value + t.dialect.quoteChar = '*' # pragma: no cover + c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) # pragma: no cover + assert res[0]['col2'] == value # pragma: no cover - t.dialect.doubleQuote = True - c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) - assert res[0]['col2'] == value + t.dialect.doubleQuote = True # pragma: no cover + c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) # pragma: no cover + assert res[0]['col2'] == value # pragma: no cover - value = value.replace('"', '*') - c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) - assert res[0]['col2'] == value + value = value.replace('"', '*') # pragma: no cover + c, res = self._roundtrip(t, fpath, {"col1": "", "col2": value}) # pragma: no cover + assert res[0]['col2'] == value # pragma: no cover @pytest.mark.xfail(reason='commentPrefix is checked only after csv.reader has parsed the line') def test_commentPrefix(self, tmp_path): @@ -134,7 +135,7 @@ def test_commentPrefix(self, tmp_path): assert res[0]['col1'] == '$val' -class 
TestNaturalLanguage(object): +class TestNaturalLanguage: def test_string(self): l = csvw.NaturalLanguage('abc') @@ -168,7 +169,7 @@ def test_serialize(self): '{"und": ["\\u00e4", "a"], "de": "\\u00f6"}' -class TestColumn(object): +class TestColumn: def test_read_rite_with_separator(self): col = csvw.Column.fromvalue({'separator': ';', 'null': 'nn'}) @@ -239,7 +240,7 @@ def _load_json(path): return json.load(f) -class TestTable(object): +class TestTable: @staticmethod def _make_table(tmp_path, data=None, metadata=None): @@ -300,7 +301,7 @@ def test_unspecified_column_in_table_without_url(self, tmp_path): list(t.iterdicts(fname=str(data))) -class TestTableGroup(object): +class TestTableGroup: @staticmethod def _make_tablegroup(tmp_path, data=None, metadata=None): @@ -759,22 +760,20 @@ def test_foreignkeys_2(self, tmp_path): with pytest.raises(ValueError): tg.check_referential_integrity() - def test_remote_schema(self, tmp_path): - import requests_mock - - with requests_mock.Mocker() as m: - schema = """ - {"columns": [ - {"name": "countryCode", "datatype": "string"}, - {"name": "name", "datatype": "string"}]} - """ - m.get("http://example.com/schema", content=schema.encode('utf8')) - tg = self._make_tablegroup( - tmp_path, - metadata="""{ + def test_remote_schema(self, tmp_path, mocker): + def request_get(url): + return GetResponse(text="""\ +{"columns": [ + {"name": "countryCode", "datatype": "string"}, + {"name": "name", "datatype": "string"}]}""") + + mocker.patch('csvw.metadata.utils.request_get', request_get) + tg = self._make_tablegroup( + tmp_path, + metadata="""{ "@context": "http://www.w3.org/ns/csvw", "tables": [{"url": "countries.csv", "tableSchema": "http://example.com/schema"}]}""") - assert len(tg.tables[0].tableSchema.columns) == 2 + assert len(tg.tables[0].tableSchema.columns) == 2 # The remote content has been inlined: out = tmp_path / 'md.json' @@ -824,20 +823,13 @@ def test_zip_support(tmp_path): assert len(list(csvw.TableGroup.from_file(out.parent 
/ 'md.json').tables[0])) == 4 -def test_from_url(): - import requests_mock - - def content(req, ctx): - ctx.status_code = 200 - return FIXTURES.joinpath(req.url.split('/')[-1]).read_bytes() - - with requests_mock.Mocker() as m: - m.get( - requests_mock.ANY, - content=content) +def test_from_url(mocker): + def request_get(url): + return GetResponse(content=FIXTURES.joinpath(url.split('/')[-1]).read_bytes()) - t = csvw.Table.from_file('http://example.com/csv.txt-table-metadata.json') - assert len(list(t)) == 2 + mocker.patch('csvw.utils.request_get', request_get) + t = csvw.Table.from_file('http://example.com/csv.txt-table-metadata.json') + assert len(list(t)) == 2 def test_datatype_limits(tmp_path): diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py new file mode 100644 index 0000000..26adbfd --- /dev/null +++ b/tests/test_metadata_utils.py @@ -0,0 +1,30 @@ +import dataclasses + +import pytest + +from csvw.metadata_utils import * + + +@pytest.fixture +def Dataclass(): + @dataclasses.dataclass + class Test: + _private: int = 5 + public: str = 'hello' + + return Test + + +@pytest.mark.parametrize( + 'data,kw,expected', + [ + (dict(), dict(), {}), + (dict(), dict(omit_defaults=False), {'public': 'hello'}), + (dict(), dict(omit_defaults=False, omit_private=False), {'public': 'hello', '_private': 5}), + (dict(), dict(omit_private=False), {}), + (dict(_private=1), dict(omit_private=False), {'_private': 1}), + (dict(public='world'), dict(), {'public': 'world'}), + ] +) +def test_dataclass_asdict(Dataclass, data, kw, expected): + assert dataclass_asdict(Dataclass(**data), **kw) == expected diff --git a/tests/test_utils.py b/tests/test_utils.py index d5b25f0..e564910 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,10 +1,70 @@ -import pathlib +import os +import urllib.error +import contextlib + +import pytest from csvw import utils -def test_ensure_path(): - assert isinstance(utils.ensure_path('test.csv'), pathlib.Path) 
+@pytest.mark.parametrize( + 's,check', + [ + ('', lambda lh: lh.params == {} and lh.url == 'url'), + ('; p=5', lambda lh: lh.params['p'] == '5'), + ] +) +def test_LinkHeader(s, check): + assert check(utils.LinkHeader.from_string(s)) + + +def test_LinkHeader_mult(): + res = list(utils.LinkHeader.iter_links(', ')) + assert len(res) == 2 + assert res[0].url == 'url1' and res[1].url == 'url2' + + +def test_urlopen(): + if os.getenv("GITHUB_ACTIONS"): + return # pragma: no cover + try: + with utils.urlopen('https://httpbin.org/delay/2', timeout=0.01) as res: + assert res.status in (404, 201) # pragma: no cover + except urllib.error.URLError as e: + assert ('timed out' in str(e)) or ('failure in name resolution' in str(e)) + + +def test_request_get(mocker): + @contextlib.contextmanager + def urlopen(url): + yield mocker.Mock( + read=lambda: '"äöü"'.encode('latin1'), + status=201, + headers=mocker.Mock(get_content_charset=lambda: 'latin1') + ) + + mocker.patch('csvw.utils.urlopen', urlopen) + res = utils.request_get('url') + assert res.text == '"äöü"' + assert res.json() == "äöü" + + +def test_request_head(mocker): + class HTTPMessage: + @staticmethod + def info(): + return mocker.Mock( + get_all=lambda _: ['', ''], + get_content_type=lambda: 'text/html') + + @contextlib.contextmanager + def urlopen(url): + yield HTTPMessage() + + mocker.patch('csvw.utils.urlopen', urlopen) + content_type, links = utils.request_head('url') + assert content_type == 'text/html' + assert len(links) == 2 def test_normalize_name():