diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000000..9bdf662f2884 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,180 @@ +name: CI + +on: + workflow_dispatch: + +jobs: + build-grpc: + runs-on: ubuntu-latest + env: + ARTIFACTORY_URL: 'relativityone.jfrog.io/artifactory' + ARTIFACTORY_USERNAME: ${{ vars.ARTIFACTORY_USERNAME }} + ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.16.x' + + - name: Set GOROOT environment variable + run: echo "GOROOT=$(go env GOROOT)" >> $GITHUB_ENV + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: 'adopt' + java-version: 11 + + - uses: gradle/actions/setup-gradle@v3 + + - name: Build vendor grpc + run: | + cd vendor/grpc-1_36_0 && ../../gradlew shadowJar + + - name: Publish vendor grpc + run: | + cd vendor/grpc-1_36_0 && ../../gradlew -PvendoredDependenciesOnly publish + + build-core: + runs-on: ubuntu-latest + env: + ARTIFACTORY_URL: 'relativityone.jfrog.io/artifactory' + ARTIFACTORY_USERNAME: ${{ vars.ARTIFACTORY_USERNAME }} + ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.16.x' + + - name: Set GOROOT environment variable + run: echo "GOROOT=$(go env GOROOT)" >> $GITHUB_ENV + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: 'adopt' + java-version: 11 + + - uses: gradle/actions/setup-gradle@v3 + + - name: Build Java Core SDK + run: | + ./gradlew :sdks:java:core:build -x test -x checkstyleMain -x checkstyleTest + + - name: Publish Java Core SDK + run: | + ./gradlew :sdks:java:core:publish -Ppublishing + + build-mongodb: + runs-on: ubuntu-latest + env: + ARTIFACTORY_URL: 'relativityone.jfrog.io/artifactory' + ARTIFACTORY_USERNAME: ${{ 
vars.ARTIFACTORY_USERNAME }} + ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.16.x' + + - name: Set GOROOT environment variable + run: echo "GOROOT=$(go env GOROOT)" >> $GITHUB_ENV + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: 'adopt' + java-version: 11 + + - uses: gradle/actions/setup-gradle@v3 + + - name: Build MongoDB IO module + run: | + ./gradlew :sdks:java:io:mongodb:build -x test + + - name: Publish MongoDB IO module + run: | + ./gradlew :sdks:java:io:mongodb:publish -Ppublishing + + build-model-pipeline: + runs-on: ubuntu-latest + env: + ARTIFACTORY_URL: 'relativityone.jfrog.io/artifactory' + ARTIFACTORY_USERNAME: ${{ vars.ARTIFACTORY_USERNAME }} + ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.16.x' + + - name: Set GOROOT environment variable + run: echo "GOROOT=$(go env GOROOT)" >> $GITHUB_ENV + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: 'adopt' + java-version: 11 + + - uses: gradle/actions/setup-gradle@v3 + + - name: Build Model Pipeline module + run: | + ./gradlew :model:pipeline:build -x test + + - name: Publish Model Pipeline module + run: | + ./gradlew :model:pipeline:publish -Ppublishing + + build-job-management: + runs-on: ubuntu-latest + env: + ARTIFACTORY_URL: 'relativityone.jfrog.io/artifactory' + ARTIFACTORY_USERNAME: ${{ vars.ARTIFACTORY_USERNAME }} + ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.16.x' + + - name: Set GOROOT environment variable + run: echo "GOROOT=$(go env GOROOT)" >> $GITHUB_ENV + + - name: Set up JDK + uses: 
actions/setup-java@v4 + with: + distribution: 'adopt' + java-version: 11 + + - uses: gradle/actions/setup-gradle@v3 + + - name: Build Job Management module + run: | + ./gradlew :model:job-management:build -x test + + - name: Publish Job Management module + run: | + ./gradlew :model:job-management:publish -Ppublishing \ No newline at end of file diff --git a/CHANGES.md b/CHANGES.md index 089ee9ad3ee2..b33abce20563 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -45,55 +45,57 @@ * Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). --> + # [2.33.X] - Unreleased ## Highlights -* New highly anticipated feature X added to Python SDK ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* New highly anticipated feature Y added to Java SDK ([BEAM-Y](https://issues.apache.org/jira/browse/BEAM-Y)). -* Go SDK is no longer experimental, and is officially part of the Beam release process. - * Matching Go SDK containers are published on release. - * Batch usage is well supported, and tested on Flink, Spark, and the Python Portable Runner. - * SDK Tests are also run against Google Cloud Dataflow, but this doesn't indicate reciprical support. - * The SDK supports Splittable DoFns, Cross Language transforms, and most Beam Model basics. - * Go Modules are now used for dependency management. - * This is a breaking change, see Breaking Changes for resolution. - * Easier path to contribute to the Go SDK, no need to set up a GO_PATH. - * Minimum Go version is now Go v1.16 - * See the announcement blogpost for full information (TODO(lostluck): Add link once published.) +- New highly anticipated feature X added to Python SDK ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- New highly anticipated feature Y added to Java SDK ([BEAM-Y](https://issues.apache.org/jira/browse/BEAM-Y)). +- Go SDK is no longer experimental, and is officially part of the Beam release process. + - Matching Go SDK containers are published on release. 
+ - Batch usage is well supported, and tested on Flink, Spark, and the Python Portable Runner. + - SDK Tests are also run against Google Cloud Dataflow, but this doesn't indicate reciprocal support. + - The SDK supports Splittable DoFns, Cross Language transforms, and most Beam Model basics. + - Go Modules are now used for dependency management. + - This is a breaking change, see Breaking Changes for resolution. + - Easier path to contribute to the Go SDK, no need to set up a GO_PATH. + - Minimum Go version is now Go v1.16 + - See the announcement blogpost for full information (TODO(lostluck): Add link once published.) ## I/Os -* Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). ## New Features / Improvements -* X feature added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* Upgrade Flink runner to Flink versions 1.13.2, 1.12.5 and 1.11.4 ([BEAM-10955](https://issues.apache.org/jira/browse/BEAM-10955)). +- X feature added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- Upgrade Flink runner to Flink versions 1.13.2, 1.12.5 and 1.11.4 ([BEAM-10955](https://issues.apache.org/jira/browse/BEAM-10955)). ## Breaking Changes -* Go SDK pipelines require new import paths to use this release due to migration to Go Modules. - * `go.mod` files will need to change to require `github.com/apache/beam/sdks/v2`. - * Code depending on beam imports need to include v2 on the module path. - * Fix by'v2' to the import paths, turning `.../sdks/go/...` to `.../sdks/v2/go/...` - * No other code change should be required to use v2.33.0 of the Go SDK. +- Go SDK pipelines require new import paths to use this release due to migration to Go Modules. + - `go.mod` files will need to change to require `github.com/apache/beam/sdks/v2`.
+ - Code depending on beam imports need to include v2 on the module path. + - Fix by adding 'v2' to the import paths, turning `.../sdks/go/...` to `.../sdks/v2/go/...` + - No other code change should be required to use v2.33.1 of the Go SDK. ## Deprecations -* Python GBK will stop supporting unbounded PCollections that have global windowing and a default trigger in Beam 2.34. This can be overriden with `--allow_unsafe_triggers`. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). -* Python GBK will start requiring safe triggers or the `--allow_unsafe_triggers` flag starting with Beam 2.34. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python GBK will stop supporting unbounded PCollections that have global windowing and a default trigger in Beam 2.34. This can be overridden with `--allow_unsafe_triggers`. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python GBK will start requiring safe triggers or the `--allow_unsafe_triggers` flag starting with Beam 2.34. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). ## Known Issues -* Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). # [2.32.X] - Unreleased ## Highlights -* New highly anticipated feature X added to Python SDK ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* New highly anticipated feature Y added to Java SDK ([BEAM-Y](https://issues.apache.org/jira/browse/BEAM-Y)). -* The [Beam DataFrame + +- New highly anticipated feature X added to Python SDK ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- New highly anticipated feature Y added to Java SDK ([BEAM-Y](https://issues.apache.org/jira/browse/BEAM-Y)). +- The [Beam DataFrame API](https://beam.apache.org/documentation/dsls/dataframes/overview/) is no longer experimental!
We've spent the time since the [2.32.0 preview announcement](https://beam.apache.org/blog/dataframe-api-preview-available/) @@ -112,254 +114,256 @@ the API, guided by your [feedback](https://beam.apache.org/community/contact-us/). - ## I/Os -* Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* Added ability to use JdbcIO.Write.withResults without statement and preparedStatementSetter. ([BEAM-12511](https://issues.apache.org/jira/browse/BEAM-12511)) -- Added ability to register URI schemes to use the S3 protocol via FileIO. ([BEAM-12435](https://issues.apache.org/jira/browse/BEAM-12435)). -* Respect number of shards set in SnowflakeWrite batch mode. ([BEAM-12715](https://issues.apache.org/jira/browse/BEAM-12715)) -* Java SDK: Update Google Cloud Healthcare IO connectors from using v1beta1 to using the GA version. +- Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- Added ability to use JdbcIO.Write.withResults without statement and preparedStatementSetter. ([BEAM-12511](https://issues.apache.org/jira/browse/BEAM-12511)) + +* Added ability to register URI schemes to use the S3 protocol via FileIO. ([BEAM-12435](https://issues.apache.org/jira/browse/BEAM-12435)). + +- Respect number of shards set in SnowflakeWrite batch mode. ([BEAM-12715](https://issues.apache.org/jira/browse/BEAM-12715)) +- Java SDK: Update Google Cloud Healthcare IO connectors from using v1beta1 to using the GA version. ## New Features / Improvements -* X feature added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* Add support to convert Beam Schema to Avro Schema for JDBC LogicalTypes: +- X feature added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). 
+- Add support to convert Beam Schema to Avro Schema for JDBC LogicalTypes: `VARCHAR`, `NVARCHAR`, `LONGVARCHAR`, `LONGNVARCHAR`, `DATE`, `TIME` (Java)([BEAM-12385](https://issues.apache.org/jira/browse/BEAM-12385)). -* Reading from JDBC source by partitions (Java) ([BEAM-12456](https://issues.apache.org/jira/browse/BEAM-12456)). -* PubsubIO can now write to a dead-letter topic after a parsing error (Java)([BEAM-12474](https://issues.apache.org/jira/browse/BEAM-12474)). -* New append-only option for Elasticsearch sink (Java) [BEAM-12601](https://issues.apache.org/jira/browse/BEAM-12601) +- Reading from JDBC source by partitions (Java) ([BEAM-12456](https://issues.apache.org/jira/browse/BEAM-12456)). +- PubsubIO can now write to a dead-letter topic after a parsing error (Java)([BEAM-12474](https://issues.apache.org/jira/browse/BEAM-12474)). +- New append-only option for Elasticsearch sink (Java) [BEAM-12601](https://issues.apache.org/jira/browse/BEAM-12601) ## Breaking Changes -* X behavior was changed ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* ListShards (with DescribeStreamSummary) is used instead of DescribeStream to list shards in Kinesis streams. Due to this change, as mentioned in [AWS documentation](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_ListShards.html), for fine-grained IAM policies it is required to update them to allow calls to ListShards and DescribeStreamSummary APIs. For more information, see [Controlling Access to Amazon Kinesis Data Streams](https://docs.aws.amazon.com/streams/latest/dev/controlling-access.html) ([BEAM-12225](https://issues.apache.org/jira/browse/BEAM-12225)). +- X behavior was changed ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- ListShards (with DescribeStreamSummary) is used instead of DescribeStream to list shards in Kinesis streams. 
Due to this change, as mentioned in [AWS documentation](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_ListShards.html), for fine-grained IAM policies it is required to update them to allow calls to ListShards and DescribeStreamSummary APIs. For more information, see [Controlling Access to Amazon Kinesis Data Streams](https://docs.aws.amazon.com/streams/latest/dev/controlling-access.html) ([BEAM-12225](https://issues.apache.org/jira/browse/BEAM-12225)). ## Deprecations -* Python GBK will stop supporting unbounded PCollections that have global windowing and a default trigger in Beam 2.33. This can be overriden with `--allow_unsafe_triggers`. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). -* Python GBK will start requiring safe triggers or the `--allow_unsafe_triggers` flag starting with Beam 2.33. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python GBK will stop supporting unbounded PCollections that have global windowing and a default trigger in Beam 2.33. This can be overridden with `--allow_unsafe_triggers`. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python GBK will start requiring safe triggers or the `--allow_unsafe_triggers` flag starting with Beam 2.33. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). ## Known Issues -* Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* Fixed race condition in RabbitMqIO causing duplicate acks (Java) ([BEAM-6516](https://issues.apache.org/jira/browse/BEAM-6516))) +- Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- Fixed race condition in RabbitMqIO causing duplicate acks (Java) ([BEAM-6516](https://issues.apache.org/jira/browse/BEAM-6516)) # [2.31.0] - 2021-07-08 ## I/Os -* Fixed bug in ReadFromBigQuery when a RuntimeValueProvider is used as value of table argument (Python) ([BEAM-12514](https://issues.apache.org/jira/browse/BEAM-12514)).
+- Fixed bug in ReadFromBigQuery when a RuntimeValueProvider is used as value of table argument (Python) ([BEAM-12514](https://issues.apache.org/jira/browse/BEAM-12514)). ## New Features / Improvements -* `CREATE FUNCTION` DDL statement added to Calcite SQL syntax. `JAR` and `AGGREGATE` are now reserved keywords. ([BEAM-12339](https://issues.apache.org/jira/browse/BEAM-12339)). -* Flink 1.13 is now supported by the Flink runner ([BEAM-12277](https://issues.apache.org/jira/browse/BEAM-12277)). -* DatastoreIO: Write and delete operations now follow automatic gradual ramp-up, +- `CREATE FUNCTION` DDL statement added to Calcite SQL syntax. `JAR` and `AGGREGATE` are now reserved keywords. ([BEAM-12339](https://issues.apache.org/jira/browse/BEAM-12339)). +- Flink 1.13 is now supported by the Flink runner ([BEAM-12277](https://issues.apache.org/jira/browse/BEAM-12277)). +- DatastoreIO: Write and delete operations now follow automatic gradual ramp-up, in line with best practices (Java/Python) ([BEAM-12260](https://issues.apache.org/jira/browse/BEAM-12260), [BEAM-12272](https://issues.apache.org/jira/browse/BEAM-12272)). -* Python `TriggerFn` has a new `may_lose_data` method to signal potential data loss. Default behavior assumes safe (necessary for backwards compatibility). See Deprecations for potential impact of overriding this. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python `TriggerFn` has a new `may_lose_data` method to signal potential data loss. Default behavior assumes safe (necessary for backwards compatibility). See Deprecations for potential impact of overriding this. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). ## Breaking Changes -* Python Row objects are now sensitive to field order. So `Row(x=3, y=4)` is no +- Python Row objects are now sensitive to field order. So `Row(x=3, y=4)` is no longer considered equal to `Row(y=4, x=3)` (BEAM-11929). 
-* Kafka Beam SQL tables now ascribe meaning to the LOCATION field; previously +- Kafka Beam SQL tables now ascribe meaning to the LOCATION field; previously it was ignored if provided. -* `TopCombineFn` disallow `compare` as its argument (Python) ([BEAM-7372](https://issues.apache.org/jira/browse/BEAM-7372)). -* Drop support for Flink 1.10 ([BEAM-12281](https://issues.apache.org/jira/browse/BEAM-12281)). +- `TopCombineFn` disallows `compare` as its argument (Python) ([BEAM-7372](https://issues.apache.org/jira/browse/BEAM-7372)). +- Drop support for Flink 1.10 ([BEAM-12281](https://issues.apache.org/jira/browse/BEAM-12281)). ## Deprecations -* Python GBK will stop supporting unbounded PCollections that have global windowing and a default trigger in Beam 2.33. This can be overriden with `--allow_unsafe_triggers`. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). -* Python GBK will start requiring safe triggers or the `--allow_unsafe_triggers` flag starting with Beam 2.33. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python GBK will stop supporting unbounded PCollections that have global windowing and a default trigger in Beam 2.33. This can be overridden with `--allow_unsafe_triggers`. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)). +- Python GBK will start requiring safe triggers or the `--allow_unsafe_triggers` flag starting with Beam 2.33. ([BEAM-9487](https://issues.apache.org/jira/browse/BEAM-9487)).
# [2.30.0] - 2021-06-09 ## I/Os -* Allow splitting apart document serialization and IO for ElasticsearchIO -* Support Bulk API request size optimization through addition of ElasticsearchIO.Write.withStatefulBatches +- Allow splitting apart document serialization and IO for ElasticsearchIO +- Support Bulk API request size optimization through addition of ElasticsearchIO.Write.withStatefulBatches ## New Features / Improvements -* Added capability to declare resource hints in Java and Python SDKs ([BEAM-2085](https://issues.apache.org/jira/browse/BEAM-2085)). -* Added Spanner IO Performance tests for read and write. (Python) ([BEAM-10029](https://issues.apache.org/jira/browse/BEAM-10029)). -* Added support for accessing GCP PubSub Message ordering keys, message IDs and message publish timestamp (Python) ([BEAM-7819](https://issues.apache.org/jira/browse/BEAM-7819)). -* DataFrame API: Added support for collecting DataFrame objects in interactive Beam ([BEAM-11855](https://issues.apache.org/jira/browse/BEAM-11855)) -* DataFrame API: Added [apache_beam.examples.dataframe](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/dataframe) module ([BEAM-12024](https://issues.apache.org/jira/browse/BEAM-12024)) -* Upgraded the GCP Libraries BOM version to 20.0.0 ([BEAM-11205](https://issues.apache.org/jira/browse/BEAM-11205)). +- Added capability to declare resource hints in Java and Python SDKs ([BEAM-2085](https://issues.apache.org/jira/browse/BEAM-2085)). +- Added Spanner IO Performance tests for read and write. (Python) ([BEAM-10029](https://issues.apache.org/jira/browse/BEAM-10029)). +- Added support for accessing GCP PubSub Message ordering keys, message IDs and message publish timestamp (Python) ([BEAM-7819](https://issues.apache.org/jira/browse/BEAM-7819)). 
+- DataFrame API: Added support for collecting DataFrame objects in interactive Beam ([BEAM-11855](https://issues.apache.org/jira/browse/BEAM-11855)) +- DataFrame API: Added [apache_beam.examples.dataframe](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/dataframe) module ([BEAM-12024](https://issues.apache.org/jira/browse/BEAM-12024)) +- Upgraded the GCP Libraries BOM version to 20.0.0 ([BEAM-11205](https://issues.apache.org/jira/browse/BEAM-11205)). For Google Cloud client library versions set by this BOM, see [this table](https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/20.0.0/artifact_details.html). ## Breaking Changes -* Drop support for Flink 1.8 and 1.9 ([BEAM-11948](https://issues.apache.org/jira/browse/BEAM-11948)). -* MongoDbIO: Read.withFilter() and Read.withProjection() are removed since they are deprecated since +- Drop support for Flink 1.8 and 1.9 ([BEAM-11948](https://issues.apache.org/jira/browse/BEAM-11948)). +- MongoDbIO: Read.withFilter() and Read.withProjection() are removed since they are deprecated since Beam 2.12.0 ([BEAM-12217](https://issues.apache.org/jira/browse/BEAM-12217)). -* RedisIO.readAll() was removed since it was deprecated since Beam 2.13.0. Please use +- RedisIO.readAll() was removed since it was deprecated since Beam 2.13.0. Please use RedisIO.readKeyPatterns() for the equivalent functionality. ([BEAM-12214](https://issues.apache.org/jira/browse/BEAM-12214)). -* MqttIO.create() with clientId constructor removed because it was deprecated since Beam +- MqttIO.create() with clientId constructor removed because it was deprecated since Beam 2.13.0 ([BEAM-12216](https://issues.apache.org/jira/browse/BEAM-12216)). # [2.29.0] - 2021-04-29 ## Highlights -* Spark Classic and Portable runners officially support Spark 3 ([BEAM-7093](https://issues.apache.org/jira/browse/BEAM-7093)). 
-* Official Java 11 support for most runners (Dataflow, Flink, Spark) ([BEAM-2530](https://issues.apache.org/jira/browse/BEAM-2530)). -* DataFrame API now supports GroupBy.apply ([BEAM-11628](https://issues.apache.org/jira/browse/BEAM-11628)). +- Spark Classic and Portable runners officially support Spark 3 ([BEAM-7093](https://issues.apache.org/jira/browse/BEAM-7093)). +- Official Java 11 support for most runners (Dataflow, Flink, Spark) ([BEAM-2530](https://issues.apache.org/jira/browse/BEAM-2530)). +- DataFrame API now supports GroupBy.apply ([BEAM-11628](https://issues.apache.org/jira/browse/BEAM-11628)). ## I/Os -* Added support for S3 filesystem on AWS SDK V2 (Java) ([BEAM-7637](https://issues.apache.org/jira/browse/BEAM-7637)) +- Added support for S3 filesystem on AWS SDK V2 (Java) ([BEAM-7637](https://issues.apache.org/jira/browse/BEAM-7637)) ## New Features / Improvements -* DataFrame API now supports pandas 1.2.x ([BEAM-11531](https://issues.apache.org/jira/browse/BEAM-11531)). -* Multiple DataFrame API bugfixes ([BEAM-12071](https://issues.apache/jira/browse/BEAM-12071), [BEAM-11929](https://issues.apache/jira/browse/BEAM-11929)) +- DataFrame API now supports pandas 1.2.x ([BEAM-11531](https://issues.apache.org/jira/browse/BEAM-11531)). +- Multiple DataFrame API bugfixes ([BEAM-12071](https://issues.apache.org/jira/browse/BEAM-12071), [BEAM-11929](https://issues.apache.org/jira/browse/BEAM-11929)) ## Breaking Changes -* Deterministic coding enforced for GroupByKey and Stateful DoFns. Previously non-deterministic coding was allowed, resulting in keys not properly being grouped in some cases.
([BEAM-11719](https://issues.apache.org/jira/browse/BEAM-11719)) To restore the old behavior, one can register `FakeDeterministicFastPrimitivesCoder` with `beam.coders.registry.register_fallback_coder(beam.coders.coders.FakeDeterministicFastPrimitivesCoder())` or use the `allow_non_deterministic_key_coders` pipeline option. ## Deprecations -* Support for Flink 1.8 and 1.9 will be removed in the next release (2.30.0) ([BEAM-11948](https://issues.apache.org/jira/browse/BEAM-11948)). +- Support for Flink 1.8 and 1.9 will be removed in the next release (2.30.0) ([BEAM-11948](https://issues.apache.org/jira/browse/BEAM-11948)). # [2.28.0] - 2021-02-22 ## Highlights -* Many improvements related to Parquet support ([BEAM-11460](https://issues.apache.org/jira/browse/BEAM-11460), [BEAM-8202](https://issues.apache.org/jira/browse/BEAM-8202), and [BEAM-11526](https://issues.apache.org/jira/browse/BEAM-11526)) -* Hash Functions in BeamSQL ([BEAM-10074](https://issues.apache.org/jira/browse/BEAM-10074)) -* Hash functions in ZetaSQL ([BEAM-11624](https://issues.apache.org/jira/browse/BEAM-11624)) -* Create ApproximateDistinct using HLL Impl ([BEAM-10324](https://issues.apache.org/jira/browse/BEAM-10324)) + +- Many improvements related to Parquet support ([BEAM-11460](https://issues.apache.org/jira/browse/BEAM-11460), [BEAM-8202](https://issues.apache.org/jira/browse/BEAM-8202), and [BEAM-11526](https://issues.apache.org/jira/browse/BEAM-11526)) +- Hash Functions in BeamSQL ([BEAM-10074](https://issues.apache.org/jira/browse/BEAM-10074)) +- Hash functions in ZetaSQL ([BEAM-11624](https://issues.apache.org/jira/browse/BEAM-11624)) +- Create ApproximateDistinct using HLL Impl ([BEAM-10324](https://issues.apache.org/jira/browse/BEAM-10324)) ## I/Os -* SpannerIO supports using BigDecimal for Numeric fields ([BEAM-11643](https://issues.apache.org/jira/browse/BEAM-11643)) -* Add Beam schema support to ParquetIO ([BEAM-11526](https://issues.apache.org/jira/browse/BEAM-11526)) -* Support 
ParquetTable Writer ([BEAM-8202](https://issues.apache.org/jira/browse/BEAM-8202)) -* GCP BigQuery sink (streaming inserts) uses runner determined sharding ([BEAM-11408](https://issues.apache.org/jira/browse/BEAM-11408)) -* PubSub support types: TIMESTAMP, DATE, TIME, DATETIME ([BEAM-11533](https://issues.apache.org/jira/browse/BEAM-11533)) +- SpannerIO supports using BigDecimal for Numeric fields ([BEAM-11643](https://issues.apache.org/jira/browse/BEAM-11643)) +- Add Beam schema support to ParquetIO ([BEAM-11526](https://issues.apache.org/jira/browse/BEAM-11526)) +- Support ParquetTable Writer ([BEAM-8202](https://issues.apache.org/jira/browse/BEAM-8202)) +- GCP BigQuery sink (streaming inserts) uses runner determined sharding ([BEAM-11408](https://issues.apache.org/jira/browse/BEAM-11408)) +- PubSub support types: TIMESTAMP, DATE, TIME, DATETIME ([BEAM-11533](https://issues.apache.org/jira/browse/BEAM-11533)) ## New Features / Improvements -* ParquetIO add methods _readGenericRecords_ and _readFilesGenericRecords_ can read files with an unknown schema. See [PR-13554](https://github.com/apache/beam/pull/13554) and ([BEAM-11460](https://issues.apache.org/jira/browse/BEAM-11460)) -* Added support for thrift in KafkaTableProvider ([BEAM-11482](https://issues.apache.org/jira/browse/BEAM-11482)) -* Added support for HadoopFormatIO to skip key/value clone ([BEAM-11457](https://issues.apache.org/jira/browse/BEAM-11457)) -* Support Conversion to GenericRecords in Convert.to transform ([BEAM-11571](https://issues.apache.org/jira/browse/BEAM-11571)). -* Support writes for Parquet Tables in Beam SQL ([BEAM-8202](https://issues.apache.org/jira/browse/BEAM-8202)). 
-* Support reading Parquet files with unknown schema ([BEAM-11460](https://issues.apache.org/jira/browse/BEAM-11460)) -* Support user configurable Hadoop Configuration flags for ParquetIO ([BEAM-11527](https://issues.apache.org/jira/browse/BEAM-11527)) -* Expose commit_offset_in_finalize and timestamp_policy to ReadFromKafka ([BEAM-11677](https://issues.apache.org/jira/browse/BEAM-11677)) -* S3 options does not provided to boto3 client while using FlinkRunner and Beam worker pool container ([BEAM-11799](https://issues.apache.org/jira/browse/BEAM-11799)) -* HDFS not deduplicating identical configuration paths ([BEAM-11329](https://issues.apache.org/jira/browse/BEAM-11329)) -* Hash Functions in BeamSQL ([BEAM-10074](https://issues.apache.org/jira/browse/BEAM-10074)) -* Create ApproximateDistinct using HLL Impl ([BEAM-10324](https://issues.apache.org/jira/browse/BEAM-10324)) -* Add Beam schema support to ParquetIO ([BEAM-11526](https://issues.apache.org/jira/browse/BEAM-11526)) -* Add a Deque Encoder ([BEAM-11538](https://issues.apache.org/jira/browse/BEAM-11538)) -* Hash functions in ZetaSQL ([BEAM-11624](https://issues.apache.org/jira/browse/BEAM-11624)) -* Refactor ParquetTableProvider ([](https://issues.apache.org/jira/browse/)) -* Add JVM properties to JavaJobServer ([BEAM-8344](https://issues.apache.org/jira/browse/BEAM-8344)) -* Single source of truth for supported Flink versions ([](https://issues.apache.org/jira/browse/)) -* Use metric for Python BigQuery streaming insert API latency logging ([BEAM-11018](https://issues.apache.org/jira/browse/BEAM-11018)) -* Use metric for Java BigQuery streaming insert API latency logging ([BEAM-11032](https://issues.apache.org/jira/browse/BEAM-11032)) -* Upgrade Flink runner to Flink versions 1.12.1 and 1.11.3 ([BEAM-11697](https://issues.apache.org/jira/browse/BEAM-11697)) -* Upgrade Beam base image to use Tensorflow 2.4.1 ([BEAM-11762](https://issues.apache.org/jira/browse/BEAM-11762)) -* Create Beam GCP BOM 
([BEAM-11665](https://issues.apache.org/jira/browse/BEAM-11665)) +- ParquetIO add methods _readGenericRecords_ and _readFilesGenericRecords_ can read files with an unknown schema. See [PR-13554](https://github.com/apache/beam/pull/13554) and ([BEAM-11460](https://issues.apache.org/jira/browse/BEAM-11460)) +- Added support for thrift in KafkaTableProvider ([BEAM-11482](https://issues.apache.org/jira/browse/BEAM-11482)) +- Added support for HadoopFormatIO to skip key/value clone ([BEAM-11457](https://issues.apache.org/jira/browse/BEAM-11457)) +- Support Conversion to GenericRecords in Convert.to transform ([BEAM-11571](https://issues.apache.org/jira/browse/BEAM-11571)). +- Support writes for Parquet Tables in Beam SQL ([BEAM-8202](https://issues.apache.org/jira/browse/BEAM-8202)). +- Support reading Parquet files with unknown schema ([BEAM-11460](https://issues.apache.org/jira/browse/BEAM-11460)) +- Support user configurable Hadoop Configuration flags for ParquetIO ([BEAM-11527](https://issues.apache.org/jira/browse/BEAM-11527)) +- Expose commit_offset_in_finalize and timestamp_policy to ReadFromKafka ([BEAM-11677](https://issues.apache.org/jira/browse/BEAM-11677)) +- S3 options does not provided to boto3 client while using FlinkRunner and Beam worker pool container ([BEAM-11799](https://issues.apache.org/jira/browse/BEAM-11799)) +- HDFS not deduplicating identical configuration paths ([BEAM-11329](https://issues.apache.org/jira/browse/BEAM-11329)) +- Hash Functions in BeamSQL ([BEAM-10074](https://issues.apache.org/jira/browse/BEAM-10074)) +- Create ApproximateDistinct using HLL Impl ([BEAM-10324](https://issues.apache.org/jira/browse/BEAM-10324)) +- Add Beam schema support to ParquetIO ([BEAM-11526](https://issues.apache.org/jira/browse/BEAM-11526)) +- Add a Deque Encoder ([BEAM-11538](https://issues.apache.org/jira/browse/BEAM-11538)) +- Hash functions in ZetaSQL ([BEAM-11624](https://issues.apache.org/jira/browse/BEAM-11624)) +- Refactor ParquetTableProvider 
([](https://issues.apache.org/jira/browse/)) +- Add JVM properties to JavaJobServer ([BEAM-8344](https://issues.apache.org/jira/browse/BEAM-8344)) +- Single source of truth for supported Flink versions ([](https://issues.apache.org/jira/browse/)) +- Use metric for Python BigQuery streaming insert API latency logging ([BEAM-11018](https://issues.apache.org/jira/browse/BEAM-11018)) +- Use metric for Java BigQuery streaming insert API latency logging ([BEAM-11032](https://issues.apache.org/jira/browse/BEAM-11032)) +- Upgrade Flink runner to Flink versions 1.12.1 and 1.11.3 ([BEAM-11697](https://issues.apache.org/jira/browse/BEAM-11697)) +- Upgrade Beam base image to use Tensorflow 2.4.1 ([BEAM-11762](https://issues.apache.org/jira/browse/BEAM-11762)) +- Create Beam GCP BOM ([BEAM-11665](https://issues.apache.org/jira/browse/BEAM-11665)) ## Breaking Changes -* The Java artifacts "beam-sdks-java-io-kinesis", "beam-sdks-java-io-google-cloud-platform", and +- The Java artifacts "beam-sdks-java-io-kinesis", "beam-sdks-java-io-google-cloud-platform", and "beam-sdks-java-extensions-sql-zetasql" declare Guava 30.1-jre dependency (It was 25.1-jre in Beam 2.27.0). This new Guava version may introduce dependency conflicts if your project or dependencies rely on removed APIs. If affected, ensure to use an appropriate Guava version via `dependencyManagement` in Maven and `force` in Gradle. - # [2.27.0] - 2021-01-08 ## I/Os -* ReadFromMongoDB can now be used with MongoDB Atlas (Python) ([BEAM-11266](https://issues.apache.org/jira/browse/BEAM-11266).) -* ReadFromMongoDB/WriteToMongoDB will mask password in display_data (Python) ([BEAM-11444](https://issues.apache.org/jira/browse/BEAM-11444).) -* Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* There is a new transform `ReadAllFromBigQuery` that can receive multiple requests to read data from BigQuery at pipeline runtime. 
See [PR 13170](https://github.com/apache/beam/pull/13170), and [BEAM-9650](https://issues.apache.org/jira/browse/BEAM-9650). + +- ReadFromMongoDB can now be used with MongoDB Atlas (Python) ([BEAM-11266](https://issues.apache.org/jira/browse/BEAM-11266).) +- ReadFromMongoDB/WriteToMongoDB will mask password in display_data (Python) ([BEAM-11444](https://issues.apache.org/jira/browse/BEAM-11444).) +- Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- There is a new transform `ReadAllFromBigQuery` that can receive multiple requests to read data from BigQuery at pipeline runtime. See [PR 13170](https://github.com/apache/beam/pull/13170), and [BEAM-9650](https://issues.apache.org/jira/browse/BEAM-9650). ## New Features / Improvements -* Beam modules that depend on Hadoop are now tested for compatibility with Hadoop 3 ([BEAM-8569](https://issues.apache.org/jira/browse/BEAM-8569)). (Hive/HCatalog pending) -* Publishing Java 11 SDK container images now supported as part of Apache Beam release process. ([BEAM-8106](https://issues.apache.org/jira/browse/BEAM-8106)) -* Added Cloud Bigtable Provider extension to Beam SQL ([BEAM-11173](https://issues.apache.org/jira/browse/BEAM-11173), [BEAM-11373](https://issues.apache.org/jira/browse/BEAM-11373)) -* Added a schema provider for thrift data ([BEAM-11338](https://issues.apache.org/jira/browse/BEAM-11338)) -* Added combiner packing pipeline optimization to Dataflow runner. ([BEAM-10641](https://issues.apache.org/jira/browse/BEAM-10641)) -* Support for the Deque structure by adding a coder ([BEAM-11538](https://issues.apache.org/jira/browse/BEAM-11538)) +- Beam modules that depend on Hadoop are now tested for compatibility with Hadoop 3 ([BEAM-8569](https://issues.apache.org/jira/browse/BEAM-8569)). (Hive/HCatalog pending) +- Publishing Java 11 SDK container images now supported as part of Apache Beam release process. 
([BEAM-8106](https://issues.apache.org/jira/browse/BEAM-8106)) +- Added Cloud Bigtable Provider extension to Beam SQL ([BEAM-11173](https://issues.apache.org/jira/browse/BEAM-11173), [BEAM-11373](https://issues.apache.org/jira/browse/BEAM-11373)) +- Added a schema provider for thrift data ([BEAM-11338](https://issues.apache.org/jira/browse/BEAM-11338)) +- Added combiner packing pipeline optimization to Dataflow runner. ([BEAM-10641](https://issues.apache.org/jira/browse/BEAM-10641)) +- Support for the Deque structure by adding a coder ([BEAM-11538](https://issues.apache.org/jira/browse/BEAM-11538)) ## Breaking Changes -* HBaseIO hbase-shaded-client dependency should be now provided by the users ([BEAM-9278](https://issues.apache.org/jira/browse/BEAM-9278)). -* `--region` flag in amazon-web-services2 was replaced by `--awsRegion` ([BEAM-11331](https://issues.apache.org/jira/projects/BEAM/issues/BEAM-11331)). +- HBaseIO hbase-shaded-client dependency should be now provided by the users ([BEAM-9278](https://issues.apache.org/jira/browse/BEAM-9278)). +- `--region` flag in amazon-web-services2 was replaced by `--awsRegion` ([BEAM-11331](https://issues.apache.org/jira/projects/BEAM/issues/BEAM-11331)). # [2.26.0] - 2020-12-11 ## Highlights -* Splittable DoFn is now the default for executing the Read transform for Java based runners (Spark with bounded pipelines) in addition to existing runners from the 2.25.0 release (Direct, Flink, Jet, Samza, Twister2). The expected output of the Read transform is unchanged. Users can opt-out using `--experiments=use_deprecated_read`. The Apache Beam community is looking for feedback for this change as the community is planning to make this change permanent with no opt-out. If you run into an issue requiring the opt-out, please send an e-mail to [user@beam.apache.org](mailto:user@beam.apache.org) specifically referencing BEAM-10670 in the subject line and why you needed to opt-out. 
(Java) ([BEAM-10670](https://issues.apache.org/jira/browse/BEAM-10670)) +- Splittable DoFn is now the default for executing the Read transform for Java based runners (Spark with bounded pipelines) in addition to existing runners from the 2.25.0 release (Direct, Flink, Jet, Samza, Twister2). The expected output of the Read transform is unchanged. Users can opt-out using `--experiments=use_deprecated_read`. The Apache Beam community is looking for feedback for this change as the community is planning to make this change permanent with no opt-out. If you run into an issue requiring the opt-out, please send an e-mail to [user@beam.apache.org](mailto:user@beam.apache.org) specifically referencing BEAM-10670 in the subject line and why you needed to opt-out. (Java) ([BEAM-10670](https://issues.apache.org/jira/browse/BEAM-10670)) ## I/Os -* Java BigQuery streaming inserts now have timeouts enabled by default. Pass `--HTTPWriteTimeout=0` to revert to the old behavior. ([BEAM-6103](https://issues.apache.org/jira/browse/BEAM-6103)) -* Added support for Contextual Text IO (Java), a version of text IO that provides metadata about the records ([BEAM-10124](https://issues.apache.org/jira/browse/BEAM-10124)). Support for this IO is currently experimental. Specifically, **there are no update-compatibility guarantees** for streaming jobs with this IO between current future verisons of Apache Beam SDK. +- Java BigQuery streaming inserts now have timeouts enabled by default. Pass `--HTTPWriteTimeout=0` to revert to the old behavior. ([BEAM-6103](https://issues.apache.org/jira/browse/BEAM-6103)) +- Added support for Contextual Text IO (Java), a version of text IO that provides metadata about the records ([BEAM-10124](https://issues.apache.org/jira/browse/BEAM-10124)). Support for this IO is currently experimental. Specifically, **there are no update-compatibility guarantees** for streaming jobs with this IO between current future versions of Apache Beam SDK. 
## New Features / Improvements -* Added support for avro payload format in Beam SQL Kafka Table ([BEAM-10885](https://issues.apache.org/jira/browse/BEAM-10885)) -* Added support for json payload format in Beam SQL Kafka Table ([BEAM-10893](https://issues.apache.org/jira/browse/BEAM-10893)) -* Added support for protobuf payload format in Beam SQL Kafka Table ([BEAM-10892](https://issues.apache.org/jira/browse/BEAM-10892)) -* Added support for avro payload format in Beam SQL Pubsub Table ([BEAM-5504](https://issues.apache.org/jira/browse/BEAM-5504)) -* Added option to disable unnecessary copying between operators in Flink Runner (Java) ([BEAM-11146](https://issues.apache.org/jira/browse/BEAM-11146)) -* Added CombineFn.setup and CombineFn.teardown to Python SDK. These methods let you initialize the CombineFn's state before any of the other methods of the CombineFn is executed and clean that state up later on. If you are using Dataflow, you need to enable Dataflow Runner V2 by passing `--experiments=use_runner_v2` before using this feature. ([BEAM-3736](https://issues.apache.org/jira/browse/BEAM-3736)) -* Added support for NestedValueProvider for the Python SDK ([BEAM-10856](https://issues.apache.org/jira/browse/BEAM-10856)). + +- Added support for avro payload format in Beam SQL Kafka Table ([BEAM-10885](https://issues.apache.org/jira/browse/BEAM-10885)) +- Added support for json payload format in Beam SQL Kafka Table ([BEAM-10893](https://issues.apache.org/jira/browse/BEAM-10893)) +- Added support for protobuf payload format in Beam SQL Kafka Table ([BEAM-10892](https://issues.apache.org/jira/browse/BEAM-10892)) +- Added support for avro payload format in Beam SQL Pubsub Table ([BEAM-5504](https://issues.apache.org/jira/browse/BEAM-5504)) +- Added option to disable unnecessary copying between operators in Flink Runner (Java) ([BEAM-11146](https://issues.apache.org/jira/browse/BEAM-11146)) +- Added CombineFn.setup and CombineFn.teardown to Python SDK. 
These methods let you initialize the CombineFn's state before any of the other methods of the CombineFn is executed and clean that state up later on. If you are using Dataflow, you need to enable Dataflow Runner V2 by passing `--experiments=use_runner_v2` before using this feature. ([BEAM-3736](https://issues.apache.org/jira/browse/BEAM-3736)) +- Added support for NestedValueProvider for the Python SDK ([BEAM-10856](https://issues.apache.org/jira/browse/BEAM-10856)). ## Breaking Changes -* BigQuery's DATETIME type now maps to Beam logical type org.apache.beam.sdk.schemas.logicaltypes.SqlTypes.DATETIME -* Pandas 1.x is now required for dataframe operations. +- BigQuery's DATETIME type now maps to Beam logical type org.apache.beam.sdk.schemas.logicaltypes.SqlTypes.DATETIME +- Pandas 1.x is now required for dataframe operations. ## Known Issues -* Non-idempotent combiners built via `CombineFn.from_callable()` or `CombineFn.maybe_from_callable()` can lead to incorrect behavior. ([BEAM-11522](https://issues.apache.org/jira/browse/BEAM-11522)). - +- Non-idempotent combiners built via `CombineFn.from_callable()` or `CombineFn.maybe_from_callable()` can lead to incorrect behavior. ([BEAM-11522](https://issues.apache.org/jira/browse/BEAM-11522)). # [2.25.0] - 2020-10-23 ## Highlights -* Splittable DoFn is now the default for executing the Read transform for Java based runners (Direct, Flink, Jet, Samza, Twister2). The expected output of the Read transform is unchanged. Users can opt-out using `--experiments=use_deprecated_read`. The Apache Beam community is looking for feedback for this change as the community is planning to make this change permanent with no opt-out. If you run into an issue requiring the opt-out, please send an e-mail to [user@beam.apache.org](mailto:user@beam.apache.org) specifically referencing BEAM-10670 in the subject line and why you needed to opt-out. 
(Java) ([BEAM-10670](https://issues.apache.org/jira/browse/BEAM-10670)) +- Splittable DoFn is now the default for executing the Read transform for Java based runners (Direct, Flink, Jet, Samza, Twister2). The expected output of the Read transform is unchanged. Users can opt-out using `--experiments=use_deprecated_read`. The Apache Beam community is looking for feedback for this change as the community is planning to make this change permanent with no opt-out. If you run into an issue requiring the opt-out, please send an e-mail to [user@beam.apache.org](mailto:user@beam.apache.org) specifically referencing BEAM-10670 in the subject line and why you needed to opt-out. (Java) ([BEAM-10670](https://issues.apache.org/jira/browse/BEAM-10670)) ## I/Os -* Added cross-language support to Java's KinesisIO, now available in the Python module `apache_beam.io.kinesis` ([BEAM-10138](https://issues.apache.org/jira/browse/BEAM-10138), [BEAM-10137](https://issues.apache.org/jira/browse/BEAM-10137)). -* Update Snowflake JDBC dependency for SnowflakeIO ([BEAM-10864](https://issues.apache.org/jira/browse/BEAM-10864)) -* Added cross-language support to Java's SnowflakeIO.Write, now available in the Python module `apache_beam.io.snowflake` ([BEAM-9898](https://issues.apache.org/jira/browse/BEAM-9898)). -* Added delete function to Java's `ElasticsearchIO#Write`. Now, Java's ElasticsearchIO can be used to selectively delete documents using `withIsDeleteFn` function ([BEAM-5757](https://issues.apache.org/jira/browse/BEAM-5757)). -* Java SDK: Added new IO connector for InfluxDB - InfluxDbIO ([BEAM-2546](https://issues.apache.org/jira/browse/BEAM-2546)). 
-* Config options added for Python's S3IO ([BEAM-9094](https://issues.apache.org/jira/browse/BEAM-9094)) +- Added cross-language support to Java's KinesisIO, now available in the Python module `apache_beam.io.kinesis` ([BEAM-10138](https://issues.apache.org/jira/browse/BEAM-10138), [BEAM-10137](https://issues.apache.org/jira/browse/BEAM-10137)). +- Update Snowflake JDBC dependency for SnowflakeIO ([BEAM-10864](https://issues.apache.org/jira/browse/BEAM-10864)) +- Added cross-language support to Java's SnowflakeIO.Write, now available in the Python module `apache_beam.io.snowflake` ([BEAM-9898](https://issues.apache.org/jira/browse/BEAM-9898)). +- Added delete function to Java's `ElasticsearchIO#Write`. Now, Java's ElasticsearchIO can be used to selectively delete documents using `withIsDeleteFn` function ([BEAM-5757](https://issues.apache.org/jira/browse/BEAM-5757)). +- Java SDK: Added new IO connector for InfluxDB - InfluxDbIO ([BEAM-2546](https://issues.apache.org/jira/browse/BEAM-2546)). +- Config options added for Python's S3IO ([BEAM-9094](https://issues.apache.org/jira/browse/BEAM-9094)) ## New Features / Improvements -* Support for repeatable fields in JSON decoder for `ReadFromBigQuery` added. (Python) ([BEAM-10524](https://issues.apache.org/jira/browse/BEAM-10524)) -* Added an opt-in, performance-driven runtime type checking system for the Python SDK ([BEAM-10549](https://issues.apache.org/jira/browse/BEAM-10549)). - More details will be in an upcoming [blog post](https://beam.apache.org/blog/python-performance-runtime-type-checking/index.html). -* Added support for Python 3 type annotations on PTransforms using typed PCollections ([BEAM-10258](https://issues.apache.org/jira/browse/BEAM-10258)). - More details will be in an upcoming [blog post](https://beam.apache.org/blog/python-improved-annotations/index.html). -* Improved the Interactive Beam API where recording streaming jobs now start a long running background recording job. 
Running ib.show() or ib.collect() samples from the recording ([BEAM-10603](https://issues.apache.org/jira/browse/BEAM-10603)). -* In Interactive Beam, ib.show() and ib.collect() now have "n" and "duration" as parameters. These mean read only up to "n" elements and up to "duration" seconds of data read from the recording ([BEAM-10603](https://issues.apache.org/jira/browse/BEAM-10603)). -* Initial preview of [Dataframes](https://s.apache.org/simpler-python-pipelines-2020#slide=id.g905ac9257b_1_21) support. - See also example at apache_beam/examples/wordcount_dataframe.py -* Fixed support for type hints on `@ptransform_fn` decorators in the Python SDK. +- Support for repeatable fields in JSON decoder for `ReadFromBigQuery` added. (Python) ([BEAM-10524](https://issues.apache.org/jira/browse/BEAM-10524)) +- Added an opt-in, performance-driven runtime type checking system for the Python SDK ([BEAM-10549](https://issues.apache.org/jira/browse/BEAM-10549)). + More details will be in an upcoming [blog post](https://beam.apache.org/blog/python-performance-runtime-type-checking/index.html). +- Added support for Python 3 type annotations on PTransforms using typed PCollections ([BEAM-10258](https://issues.apache.org/jira/browse/BEAM-10258)). + More details will be in an upcoming [blog post](https://beam.apache.org/blog/python-improved-annotations/index.html). +- Improved the Interactive Beam API where recording streaming jobs now start a long running background recording job. Running ib.show() or ib.collect() samples from the recording ([BEAM-10603](https://issues.apache.org/jira/browse/BEAM-10603)). +- In Interactive Beam, ib.show() and ib.collect() now have "n" and "duration" as parameters. These mean read only up to "n" elements and up to "duration" seconds of data read from the recording ([BEAM-10603](https://issues.apache.org/jira/browse/BEAM-10603)). +- Initial preview of [Dataframes](https://s.apache.org/simpler-python-pipelines-2020#slide=id.g905ac9257b_1_21) support. 
+ See also example at apache_beam/examples/wordcount_dataframe.py +- Fixed support for type hints on `@ptransform_fn` decorators in the Python SDK. ([BEAM-4091](https://issues.apache.org/jira/browse/BEAM-4091)) This has not enabled by default to preserve backwards compatibility; use the `--type_check_additional=ptransform_fn` flag to enable. It may be enabled by @@ -367,100 +371,99 @@ ## Breaking Changes -* Python 2 and Python 3.5 support dropped ([BEAM-10644](https://issues.apache.org/jira/browse/BEAM-10644), [BEAM-9372](https://issues.apache.org/jira/browse/BEAM-9372)). -* Pandas 1.x allowed. Older version of Pandas may still be used, but may not be as well tested. +- Python 2 and Python 3.5 support dropped ([BEAM-10644](https://issues.apache.org/jira/browse/BEAM-10644), [BEAM-9372](https://issues.apache.org/jira/browse/BEAM-9372)). +- Pandas 1.x allowed. Older version of Pandas may still be used, but may not be as well tested. ## Deprecations -* Python transform ReadFromSnowflake has been moved from `apache_beam.io.external.snowflake` to `apache_beam.io.snowflake`. The previous path will be removed in the future versions. +- Python transform ReadFromSnowflake has been moved from `apache_beam.io.external.snowflake` to `apache_beam.io.snowflake`. The previous path will be removed in the future versions. ## Known Issues -* Dataflow streaming timers once against not strictly time ordered when set earlier mid-bundle, as the fix for [BEAM-8543](https://issues.apache.org/jira/browse/BEAM-8543) introduced more severe bugs and has been rolled back. -* Default compressor change breaks dataflow python streaming job update compatibility. 
Please use python SDK version <= 2.23.0 or > 2.25.0 if job update is critical.([BEAM-11113](https://issues.apache.org/jira/browse/BEAM-11113)) - +- Dataflow streaming timers once again not strictly time ordered when set earlier mid-bundle, as the fix for [BEAM-8543](https://issues.apache.org/jira/browse/BEAM-8543) introduced more severe bugs and has been rolled back. +- Default compressor change breaks dataflow python streaming job update compatibility. Please use python SDK version <= 2.23.0 or > 2.25.0 if job update is critical.([BEAM-11113](https://issues.apache.org/jira/browse/BEAM-11113)) # [2.24.0] - 2020-09-18 ## Highlights -* Apache Beam 2.24.0 is the last release with Python 2 and Python 3.5 +- Apache Beam 2.24.0 is the last release with Python 2 and Python 3.5 support. ## I/Os -* New overloads for BigtableIO.Read.withKeyRange() and BigtableIO.Read.withRowFilter() +- New overloads for BigtableIO.Read.withKeyRange() and BigtableIO.Read.withRowFilter() methods that take ValueProvider as a parameter (Java) ([BEAM-10283](https://issues.apache.org/jira/browse/BEAM-10283)). -* The WriteToBigQuery transform (Python) in Dataflow Batch no longer relies on BigQuerySink by default. It relies on +- The WriteToBigQuery transform (Python) in Dataflow Batch no longer relies on BigQuerySink by default. It relies on a new, fully-featured transform based on file loads into BigQuery. To revert the behavior to the old implementation, you may use `--experiments=use_legacy_bq_sink`. -* Add cross-language support to Java's JdbcIO, now available in the Python module `apache_beam.io.jdbc` ([BEAM-10135](https://issues.apache.org/jira/browse/BEAM-10135), [BEAM-10136](https://issues.apache.org/jira/browse/BEAM-10136)). -* Add support of AWS SDK v2 for KinesisIO.Read (Java) ([BEAM-9702](https://issues.apache.org/jira/browse/BEAM-9702)). 
-* Add streaming support to SnowflakeIO in Java SDK ([BEAM-9896](https://issues.apache.org/jira/browse/BEAM-9896)) -* Support reading and writing to Google Healthcare DICOM APIs in Python SDK ([BEAM-10601](https://issues.apache.org/jira/browse/BEAM-10601)) -* Add dispositions for SnowflakeIO.write ([BEAM-10343](https://issues.apache.org/jira/browse/BEAM-10343)) -* Add cross-language support to SnowflakeIO.Read now available in the Python module `apache_beam.io.external.snowflake` ([BEAM-9897](https://issues.apache.org/jira/browse/BEAM-9897)). +- Add cross-language support to Java's JdbcIO, now available in the Python module `apache_beam.io.jdbc` ([BEAM-10135](https://issues.apache.org/jira/browse/BEAM-10135), [BEAM-10136](https://issues.apache.org/jira/browse/BEAM-10136)). +- Add support of AWS SDK v2 for KinesisIO.Read (Java) ([BEAM-9702](https://issues.apache.org/jira/browse/BEAM-9702)). +- Add streaming support to SnowflakeIO in Java SDK ([BEAM-9896](https://issues.apache.org/jira/browse/BEAM-9896)) +- Support reading and writing to Google Healthcare DICOM APIs in Python SDK ([BEAM-10601](https://issues.apache.org/jira/browse/BEAM-10601)) +- Add dispositions for SnowflakeIO.write ([BEAM-10343](https://issues.apache.org/jira/browse/BEAM-10343)) +- Add cross-language support to SnowflakeIO.Read now available in the Python module `apache_beam.io.external.snowflake` ([BEAM-9897](https://issues.apache.org/jira/browse/BEAM-9897)). ## New Features / Improvements -* Shared library for simplifying management of large shared objects added to Python SDK. An example use case is sharing a large TF model object across threads ([BEAM-10417](https://issues.apache.org/jira/browse/BEAM-10417)). -* Dataflow streaming timers are not strictly time ordered when set earlier mid-bundle ([BEAM-8543](https://issues.apache.org/jira/browse/BEAM-8543)). 
-* OnTimerContext should not create a new one when processing each element/timer in FnApiDoFnRunner ([BEAM-9839](https://issues.apache.org/jira/browse/BEAM-9839)) -* Key should be available in @OnTimer methods (Spark Runner) ([BEAM-9850](https://issues.apache.org/jira/browse/BEAM-9850)) +- Shared library for simplifying management of large shared objects added to Python SDK. An example use case is sharing a large TF model object across threads ([BEAM-10417](https://issues.apache.org/jira/browse/BEAM-10417)). +- Dataflow streaming timers are not strictly time ordered when set earlier mid-bundle ([BEAM-8543](https://issues.apache.org/jira/browse/BEAM-8543)). +- OnTimerContext should not create a new one when processing each element/timer in FnApiDoFnRunner ([BEAM-9839](https://issues.apache.org/jira/browse/BEAM-9839)) +- Key should be available in @OnTimer methods (Spark Runner) ([BEAM-9850](https://issues.apache.org/jira/browse/BEAM-9850)) ## Breaking Changes -* WriteToBigQuery transforms now require a GCS location to be provided through either +- WriteToBigQuery transforms now require a GCS location to be provided through either custom_gcs_temp_location in the constructor of WriteToBigQuery or the fallback option --temp_location, or pass method="STREAMING_INSERTS" to WriteToBigQuery ([BEAM-6928](https://issues.apache.org/jira/browse/BEAM-6928)). -* Python SDK now understands `typing.FrozenSet` type hints, which are not interchangeable with `typing.Set`. You may need to update your pipelines if type checking fails. ([BEAM-10197](https://issues.apache.org/jira/browse/BEAM-10197)) +- Python SDK now understands `typing.FrozenSet` type hints, which are not interchangeable with `typing.Set`. You may need to update your pipelines if type checking fails. 
([BEAM-10197](https://issues.apache.org/jira/browse/BEAM-10197)) ## Known issues -* When a timer fires but is reset prior to being executed, a watermark hold may be leaked, causing a stuck pipeline [BEAM-10991](https://issues.apache.org/jira/browse/BEAM-10991). -* Default compressor change breaks dataflow python streaming job update compatibility. Please use python SDK version <= 2.23.0 or > 2.25.0 if job update is critical.([BEAM-11113](https://issues.apache.org/jira/browse/BEAM-11113)) +- When a timer fires but is reset prior to being executed, a watermark hold may be leaked, causing a stuck pipeline [BEAM-10991](https://issues.apache.org/jira/browse/BEAM-10991). +- Default compressor change breaks dataflow python streaming job update compatibility. Please use python SDK version <= 2.23.0 or > 2.25.0 if job update is critical.([BEAM-11113](https://issues.apache.org/jira/browse/BEAM-11113)) # [2.23.0] - 2020-06-29 ## Highlights -* Twister2 Runner ([BEAM-7304](https://issues.apache.org/jira/browse/BEAM-7304)). -* Python 3.8 support ([BEAM-8494](https://issues.apache.org/jira/browse/BEAM-8494)). +- Twister2 Runner ([BEAM-7304](https://issues.apache.org/jira/browse/BEAM-7304)). +- Python 3.8 support ([BEAM-8494](https://issues.apache.org/jira/browse/BEAM-8494)). ## I/Os -* Support for reading from Snowflake added (Java) ([BEAM-9722](https://issues.apache.org/jira/browse/BEAM-9722)). -* Support for writing to Splunk added (Java) ([BEAM-8596](https://issues.apache.org/jira/browse/BEAM-8596)). -* Support for assume role added (Java) ([BEAM-10335](https://issues.apache.org/jira/browse/BEAM-10335)). -* A new transform to read from BigQuery has been added: `apache_beam.io.gcp.bigquery.ReadFromBigQuery`. This transform +- Support for reading from Snowflake added (Java) ([BEAM-9722](https://issues.apache.org/jira/browse/BEAM-9722)). +- Support for writing to Splunk added (Java) ([BEAM-8596](https://issues.apache.org/jira/browse/BEAM-8596)). 
+- Support for assume role added (Java) ([BEAM-10335](https://issues.apache.org/jira/browse/BEAM-10335)). +- A new transform to read from BigQuery has been added: `apache_beam.io.gcp.bigquery.ReadFromBigQuery`. This transform is experimental. It reads data from BigQuery by exporting data to Avro files, and reading those files. It also supports reading data by exporting to JSON files. This has small differences in behavior for Time and Date-related fields. See Pydoc for more information. ## New Features / Improvements -* Update Snowflake JDBC dependency and add application=beam to connection URL ([BEAM-10383](https://issues.apache.org/jira/browse/BEAM-10383)). +- Update Snowflake JDBC dependency and add application=beam to connection URL ([BEAM-10383](https://issues.apache.org/jira/browse/BEAM-10383)). ## Breaking Changes -* `RowJson.RowJsonDeserializer`, `JsonToRow`, and `PubsubJsonTableProvider` now accept "implicit +- `RowJson.RowJsonDeserializer`, `JsonToRow`, and `PubsubJsonTableProvider` now accept "implicit nulls" by default when deserializing JSON (Java) ([BEAM-10220](https://issues.apache.org/jira/browse/BEAM-10220)). Previously nulls could only be represented with explicit null values, as in `{"foo": "bar", "baz": null}`, whereas an implicit null like `{"foo": "bar"}` would raise an exception. Now both JSON strings will yield the same result by default. This behavior can be overridden with `RowJson.RowJsonDeserializer#withNullBehavior`. -* Fixed a bug in `GroupIntoBatches` experimental transform in Python to actually group batches by key. +- Fixed a bug in `GroupIntoBatches` experimental transform in Python to actually group batches by key. This changes the output type for this transform ([BEAM-6696](https://issues.apache.org/jira/browse/BEAM-6696)). ## Deprecations -* Remove Gearpump runner. ([BEAM-9999](https://issues.apache.org/jira/browse/BEAM-9999)) -* Remove Apex runner. 
([BEAM-9999](https://issues.apache.org/jira/browse/BEAM-9999)) -* RedisIO.readAll() is deprecated and will be removed in 2 versions, users must use RedisIO.readKeyPatterns() as a replacement ([BEAM-9747](https://issues.apache.org/jira/browse/BEAM-9747)). +- Remove Gearpump runner. ([BEAM-9999](https://issues.apache.org/jira/browse/BEAM-9999)) +- Remove Apex runner. ([BEAM-9999](https://issues.apache.org/jira/browse/BEAM-9999)) +- RedisIO.readAll() is deprecated and will be removed in 2 versions, users must use RedisIO.readKeyPatterns() as a replacement ([BEAM-9747](https://issues.apache.org/jira/browse/BEAM-9747)). ## Known Issues -* Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- Fixed X (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). # [2.22.0] - 2020-06-08 @@ -468,159 +471,158 @@ ## I/Os -* Basic Kafka read/write support for DataflowRunner (Python) ([BEAM-8019](https://issues.apache.org/jira/browse/BEAM-8019)). -* Sources and sinks for Google Healthcare APIs (Java)([BEAM-9468](https://issues.apache.org/jira/browse/BEAM-9468)). -* Support for writing to Snowflake added (Java) ([BEAM-9894](https://issues.apache.org/jira/browse/BEAM-9894)). +- Basic Kafka read/write support for DataflowRunner (Python) ([BEAM-8019](https://issues.apache.org/jira/browse/BEAM-8019)). +- Sources and sinks for Google Healthcare APIs (Java)([BEAM-9468](https://issues.apache.org/jira/browse/BEAM-9468)). +- Support for writing to Snowflake added (Java) ([BEAM-9894](https://issues.apache.org/jira/browse/BEAM-9894)). ## New Features / Improvements -* `--workerCacheMB` flag is supported in Dataflow streaming pipeline ([BEAM-9964](https://issues.apache.org/jira/browse/BEAM-9964)) -* `--direct_num_workers=0` is supported for FnApi runner. It will set the number of threads/subprocesses to number of cores of the machine executing the pipeline ([BEAM-9443](https://issues.apache.org/jira/browse/BEAM-9443)). 
-* Python SDK now has experimental support for SqlTransform ([BEAM-8603](https://issues.apache.org/jira/browse/BEAM-8603)). -* Add OnWindowExpiration method to Stateful DoFn ([BEAM-1589](https://issues.apache.org/jira/browse/BEAM-1589)). -* Added PTransforms for Google Cloud DLP (Data Loss Prevention) services integration ([BEAM-9723](https://issues.apache.org/jira/browse/BEAM-9723)): - * Inspection of data, - * Deidentification of data, - * Reidentification of data. -* Add a more complete I/O support matrix in the documentation site ([BEAM-9916](https://issues.apache.org/jira/browse/BEAM-9916)). -* Upgrade Sphinx to 3.0.3 for building PyDoc. -* Added a PTransform for image annotation using Google Cloud AI image processing service -([BEAM-9646](https://issues.apache.org/jira/browse/BEAM-9646)) -* Dataflow streaming timers are not strictly time ordered when set earlier mid-bundle ([BEAM-8543](https://issues.apache.org/jira/browse/BEAM-8543)). +- `--workerCacheMB` flag is supported in Dataflow streaming pipeline ([BEAM-9964](https://issues.apache.org/jira/browse/BEAM-9964)) +- `--direct_num_workers=0` is supported for FnApi runner. It will set the number of threads/subprocesses to number of cores of the machine executing the pipeline ([BEAM-9443](https://issues.apache.org/jira/browse/BEAM-9443)). +- Python SDK now has experimental support for SqlTransform ([BEAM-8603](https://issues.apache.org/jira/browse/BEAM-8603)). +- Add OnWindowExpiration method to Stateful DoFn ([BEAM-1589](https://issues.apache.org/jira/browse/BEAM-1589)). +- Added PTransforms for Google Cloud DLP (Data Loss Prevention) services integration ([BEAM-9723](https://issues.apache.org/jira/browse/BEAM-9723)): + - Inspection of data, + - Deidentification of data, + - Reidentification of data. +- Add a more complete I/O support matrix in the documentation site ([BEAM-9916](https://issues.apache.org/jira/browse/BEAM-9916)). +- Upgrade Sphinx to 3.0.3 for building PyDoc. 
+- Added a PTransform for image annotation using Google Cloud AI image processing service + ([BEAM-9646](https://issues.apache.org/jira/browse/BEAM-9646)) +- Dataflow streaming timers are not strictly time ordered when set earlier mid-bundle ([BEAM-8543](https://issues.apache.org/jira/browse/BEAM-8543)). ## Breaking Changes -* The Python SDK now requires `--job_endpoint` to be set when using `--runner=PortableRunner` ([BEAM-9860](https://issues.apache.org/jira/browse/BEAM-9860)). Users seeking the old default behavior should set `--runner=FlinkRunner` instead. +- The Python SDK now requires `--job_endpoint` to be set when using `--runner=PortableRunner` ([BEAM-9860](https://issues.apache.org/jira/browse/BEAM-9860)). Users seeking the old default behavior should set `--runner=FlinkRunner` instead. ## Deprecations ## Known Issues - # [2.21.0] - 2020-05-27 ## Highlights ## I/Os -* Python: Deprecated module `apache_beam.io.gcp.datastore.v1` has been removed -as the client it uses is out of date and does not support Python 3 -([BEAM-9529](https://issues.apache.org/jira/browse/BEAM-9529)). -Please migrate your code to use -[apache_beam.io.gcp.datastore.**v1new**](https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.datastore.v1new.datastoreio.html). -See the updated -[datastore_wordcount](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/cookbook/datastore_wordcount.py) -for example usage. -* Python SDK: Added integration tests and updated batch write functionality for Google Cloud Spanner transform ([BEAM-8949](https://issues.apache.org/jira/browse/BEAM-8949)). + +- Python: Deprecated module `apache_beam.io.gcp.datastore.v1` has been removed + as the client it uses is out of date and does not support Python 3 + ([BEAM-9529](https://issues.apache.org/jira/browse/BEAM-9529)). 
+ Please migrate your code to use + [apache_beam.io.gcp.datastore.**v1new**](https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.datastore.v1new.datastoreio.html). + See the updated + [datastore_wordcount](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/cookbook/datastore_wordcount.py) + for example usage. +- Python SDK: Added integration tests and updated batch write functionality for Google Cloud Spanner transform ([BEAM-8949](https://issues.apache.org/jira/browse/BEAM-8949)). ## New Features / Improvements -* Python SDK will now use Python 3 type annotations as pipeline type hints. -([#10717](https://github.com/apache/beam/pull/10717)) - - If you suspect that this feature is causing your pipeline to fail, calling - `apache_beam.typehints.disable_type_annotations()` before pipeline creation - will disable is completely, and decorating specific functions (such as - `process()`) with `@apache_beam.typehints.no_annotations` will disable it - for that function. - - More details will be in - [Ensuring Python Type Safety](https://beam.apache.org/documentation/sdks/python-type-safety/) - and an upcoming - [blog post](https://beam.apache.org/blog/python-typing/index.html). - -* Java SDK: Introducing the concept of options in Beam Schemas. These options add extra -context to fields and schemas. This replaces the current Beam metadata that is present -in a FieldType only, options are available in fields and row schemas. Schema options are -fully typed and can contain complex rows. *Remark: Schema aware is still experimental.* -([BEAM-9035](https://issues.apache.org/jira/browse/BEAM-9035)) -* Java SDK: The protobuf extension is fully schema aware and also includes protobuf option -conversion to beam schema options. 
*Remark: Schema aware is still experimental.* -([BEAM-9044](https://issues.apache.org/jira/browse/BEAM-9044)) -* Added ability to write to BigQuery via Avro file loads (Python) ([BEAM-8841](https://issues.apache.org/jira/browse/BEAM-8841)) - - By default, file loads will be done using JSON, but it is possible to - specify the temp_file_format parameter to perform file exports with AVRO. - AVRO-based file loads work by exporting Python types into Avro types, so - to switch to Avro-based loads, you will need to change your data types - from Json-compatible types (string-type dates and timestamp, long numeric - values as strings) into Python native types that are written to Avro - (Python's date, datetime types, decimal, etc). For more information - see https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#avro_conversions. -* Added integration of Java SDK with Google Cloud AI VideoIntelligence service -([BEAM-9147](https://issues.apache.org/jira/browse/BEAM-9147)) -* Added integration of Java SDK with Google Cloud AI natural language processing API -([BEAM-9634](https://issues.apache.org/jira/browse/BEAM-9634)) -* `docker-pull-licenses` tag was introduced. Licenses/notices of third party dependencies will be added to the docker images when `docker-pull-licenses` was set. + +- Python SDK will now use Python 3 type annotations as pipeline type hints. + ([#10717](https://github.com/apache/beam/pull/10717)) + + If you suspect that this feature is causing your pipeline to fail, calling + `apache_beam.typehints.disable_type_annotations()` before pipeline creation + will disable it completely, and decorating specific functions (such as + `process()`) with `@apache_beam.typehints.no_annotations` will disable it + for that function. + + More details will be in + [Ensuring Python Type Safety](https://beam.apache.org/documentation/sdks/python-type-safety/) + and an upcoming + [blog post](https://beam.apache.org/blog/python-typing/index.html).
+ +- Java SDK: Introducing the concept of options in Beam Schemas. These options add extra + context to fields and schemas. This replaces the current Beam metadata that is present + in a FieldType only, options are available in fields and row schemas. Schema options are + fully typed and can contain complex rows. _Remark: Schema aware is still experimental._ + ([BEAM-9035](https://issues.apache.org/jira/browse/BEAM-9035)) +- Java SDK: The protobuf extension is fully schema aware and also includes protobuf option + conversion to beam schema options. _Remark: Schema aware is still experimental._ + ([BEAM-9044](https://issues.apache.org/jira/browse/BEAM-9044)) +- Added ability to write to BigQuery via Avro file loads (Python) ([BEAM-8841](https://issues.apache.org/jira/browse/BEAM-8841)) + + By default, file loads will be done using JSON, but it is possible to + specify the temp_file_format parameter to perform file exports with AVRO. + AVRO-based file loads work by exporting Python types into Avro types, so + to switch to Avro-based loads, you will need to change your data types + from Json-compatible types (string-type dates and timestamp, long numeric + values as strings) into Python native types that are written to Avro + (Python's date, datetime types, decimal, etc). For more information + see https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#avro_conversions. + +- Added integration of Java SDK with Google Cloud AI VideoIntelligence service + ([BEAM-9147](https://issues.apache.org/jira/browse/BEAM-9147)) +- Added integration of Java SDK with Google Cloud AI natural language processing API + ([BEAM-9634](https://issues.apache.org/jira/browse/BEAM-9634)) +- `docker-pull-licenses` tag was introduced. Licenses/notices of third party dependencies will be added to the docker images when `docker-pull-licenses` was set. The files are added to `/opt/apache/beam/third_party_licenses/`. By default, no licenses/notices are added to the docker images. 
([BEAM-9136](https://issues.apache.org/jira/browse/BEAM-9136)) - ## Breaking Changes -* Dataflow runner now requires the `--region` option to be set, unless a default value is set in the environment ([BEAM-9199](https://issues.apache.org/jira/browse/BEAM-9199)). See [here](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) for more details. -* HBaseIO.ReadAll now requires a PCollection of HBaseIO.Read objects instead of HBaseQuery objects ([BEAM-9279](https://issues.apache.org/jira/browse/BEAM-9279)). -* ProcessContext.updateWatermark has been removed in favor of using a WatermarkEstimator ([BEAM-9430](https://issues.apache.org/jira/browse/BEAM-9430)). -* Coder inference for PCollection of Row objects has been disabled ([BEAM-9569](https://issues.apache.org/jira/browse/BEAM-9569)). -* Go SDK docker images are no longer released until further notice. +- Dataflow runner now requires the `--region` option to be set, unless a default value is set in the environment ([BEAM-9199](https://issues.apache.org/jira/browse/BEAM-9199)). See [here](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) for more details. +- HBaseIO.ReadAll now requires a PCollection of HBaseIO.Read objects instead of HBaseQuery objects ([BEAM-9279](https://issues.apache.org/jira/browse/BEAM-9279)). +- ProcessContext.updateWatermark has been removed in favor of using a WatermarkEstimator ([BEAM-9430](https://issues.apache.org/jira/browse/BEAM-9430)). +- Coder inference for PCollection of Row objects has been disabled ([BEAM-9569](https://issues.apache.org/jira/browse/BEAM-9569)). +- Go SDK docker images are no longer released until further notice. ## Deprecations -* Java SDK: Beam Schema FieldType.getMetadata is now deprecated and is replaced by the Beam -Schema Options, it will be removed in version `2.23.0`. ([BEAM-9704](https://issues.apache.org/jira/browse/BEAM-9704)) -* The `--zone` option in the Dataflow runner is now deprecated. 
Please use `--worker_zone` instead. ([BEAM-9716](https://issues.apache.org/jira/browse/BEAM-9716)) -## Known Issues +- Java SDK: Beam Schema FieldType.getMetadata is now deprecated and is replaced by the Beam + Schema Options, it will be removed in version `2.23.0`. ([BEAM-9704](https://issues.apache.org/jira/browse/BEAM-9704)) +- The `--zone` option in the Dataflow runner is now deprecated. Please use `--worker_zone` instead. ([BEAM-9716](https://issues.apache.org/jira/browse/BEAM-9716)) +## Known Issues # [2.20.0] - 2020-04-15 ## Highlights - ## I/Os -* Java SDK: Adds support for Thrift encoded data via ThriftIO. ([BEAM-8561](https://issues.apache.org/jira/browse/BEAM-8561)) -* Java SDK: KafkaIO supports schema resolution using Confluent Schema Registry. ([BEAM-7310](https://issues.apache.org/jira/browse/BEAM-7310)) -* Java SDK: Add Google Cloud Healthcare IO connectors: HL7v2IO and FhirIO ([BEAM-9468](https://issues.apache.org/jira/browse/BEAM-9468)) -* Python SDK: Support for Google Cloud Spanner. This is an experimental module for reading and writing data from Google Cloud Spanner ([BEAM-7246](https://issues.apache.org/jira/browse/BEAM-7246)). -* Python SDK: Adds support for standard HDFS URLs (with server name). ([#10223](https://github.com/apache/beam/pull/10223)). - +- Java SDK: Adds support for Thrift encoded data via ThriftIO. ([BEAM-8561](https://issues.apache.org/jira/browse/BEAM-8561)) +- Java SDK: KafkaIO supports schema resolution using Confluent Schema Registry. ([BEAM-7310](https://issues.apache.org/jira/browse/BEAM-7310)) +- Java SDK: Add Google Cloud Healthcare IO connectors: HL7v2IO and FhirIO ([BEAM-9468](https://issues.apache.org/jira/browse/BEAM-9468)) +- Python SDK: Support for Google Cloud Spanner. This is an experimental module for reading and writing data from Google Cloud Spanner ([BEAM-7246](https://issues.apache.org/jira/browse/BEAM-7246)). +- Python SDK: Adds support for standard HDFS URLs (with server name). 
([#10223](https://github.com/apache/beam/pull/10223)). ## New Features / Improvements -* New AnnotateVideo & AnnotateVideoWithContext PTransform's that integrates GCP Video Intelligence functionality. (Python) ([BEAM-9146](https://issues.apache.org/jira/browse/BEAM-9146)) -* New AnnotateImage & AnnotateImageWithContext PTransform's for element-wise & batch image annotation using Google Cloud Vision API. (Python) ([BEAM-9247](https://issues.apache.org/jira/browse/BEAM-9247)) -* Added a PTransform for inspection and deidentification of text using Google Cloud DLP. (Python) ([BEAM-9258](https://issues.apache.org/jira/browse/BEAM-9258)) -* New AnnotateText PTransform that integrates Google Cloud Natural Language functionality (Python) ([BEAM-9248](https://issues.apache.org/jira/browse/BEAM-9248)) -* _ReadFromBigQuery_ now supports value providers for the query string (Python) ([BEAM-9305](https://issues.apache.org/jira/browse/BEAM-9305)) -* Direct runner for FnApi supports further parallelism (Python) ([BEAM-9228](https://issues.apache.org/jira/browse/BEAM-9228)) -* Support for _@RequiresTimeSortedInput_ in Flink and Spark (Java) ([BEAM-8550](https://issues.apache.org/jira/browse/BEAM-8550)) +- New AnnotateVideo & AnnotateVideoWithContext PTransform's that integrates GCP Video Intelligence functionality. (Python) ([BEAM-9146](https://issues.apache.org/jira/browse/BEAM-9146)) +- New AnnotateImage & AnnotateImageWithContext PTransform's for element-wise & batch image annotation using Google Cloud Vision API. (Python) ([BEAM-9247](https://issues.apache.org/jira/browse/BEAM-9247)) +- Added a PTransform for inspection and deidentification of text using Google Cloud DLP. 
(Python) ([BEAM-9258](https://issues.apache.org/jira/browse/BEAM-9258)) +- New AnnotateText PTransform that integrates Google Cloud Natural Language functionality (Python) ([BEAM-9248](https://issues.apache.org/jira/browse/BEAM-9248)) +- _ReadFromBigQuery_ now supports value providers for the query string (Python) ([BEAM-9305](https://issues.apache.org/jira/browse/BEAM-9305)) +- Direct runner for FnApi supports further parallelism (Python) ([BEAM-9228](https://issues.apache.org/jira/browse/BEAM-9228)) +- Support for _@RequiresTimeSortedInput_ in Flink and Spark (Java) ([BEAM-8550](https://issues.apache.org/jira/browse/BEAM-8550)) ## Breaking Changes -* ReadFromPubSub(topic=) in Python previously created a subscription under the same project as the topic. Now it will create the subscription under the project specified in pipeline_options. If the project is not specified in pipeline_options, then it will create the subscription under the same project as the topic. ([BEAM-3453](https://issues.apache.org/jira/browse/BEAM-3453)). -* SpannerAccessor in Java is now package-private to reduce API surface. `SpannerConfig.connectToSpanner` has been moved to `SpannerAccessor.create`. ([BEAM-9310](https://issues.apache.org/jira/browse/BEAM-9310)). -* ParquetIO hadoop dependency should be now provided by the users ([BEAM-8616](https://issues.apache.org/jira/browse/BEAM-8616)). -* Docker images will be deployed to +- ReadFromPubSub(topic=) in Python previously created a subscription under the same project as the topic. Now it will create the subscription under the project specified in pipeline_options. If the project is not specified in pipeline_options, then it will create the subscription under the same project as the topic. ([BEAM-3453](https://issues.apache.org/jira/browse/BEAM-3453)). +- SpannerAccessor in Java is now package-private to reduce API surface. `SpannerConfig.connectToSpanner` has been moved to `SpannerAccessor.create`. 
([BEAM-9310](https://issues.apache.org/jira/browse/BEAM-9310)). +- ParquetIO hadoop dependency should be now provided by the users ([BEAM-8616](https://issues.apache.org/jira/browse/BEAM-8616)). +- Docker images will be deployed to [apache/beam](https://hub.docker.com/search?q=apache%2Fbeam&type=image) repositories from 2.20. They - used to be deployed to - [apachebeam](https://hub.docker.com/search?q=apachebeam&type=image) repository. - ([BEAM-9063](https://issues.apache.org/jira/browse/BEAM-9093)) -* PCollections now have tags inferred from the result type (e.g. the keys of a dict or index of a tuple). Users may expect the old implementation which gave PCollection output ids a monotonically increasing id. To go back to the old implementation, use the `force_generated_pcollection_output_ids` experiment. + used to be deployed to + [apachebeam](https://hub.docker.com/search?q=apachebeam&type=image) repository. + ([BEAM-9063](https://issues.apache.org/jira/browse/BEAM-9093)) +- PCollections now have tags inferred from the result type (e.g. the keys of a dict or index of a tuple). Users may expect the old implementation which gave PCollection output ids a monotonically increasing id. To go back to the old implementation, use the `force_generated_pcollection_output_ids` experiment. ## Deprecations ## Bugfixes -* Fixed numpy operators in ApproximateQuantiles (Python) ([BEAM-9579](https://issues.apache.org/jira/browse/BEAM-9579)). -* Fixed exception when running in IPython notebook (Python) ([BEAM-X9277](https://issues.apache.org/jira/browse/BEAM-9277)). -* Fixed Flink uberjar job termination bug. ([BEAM-9225](https://issues.apache.org/jira/browse/BEAM-9225)) -* Fixed SyntaxError in process worker startup ([BEAM-9503](https://issues.apache.org/jira/browse/BEAM-9503)) -* Key should be available in @OnTimer methods (Java) ([BEAM-1819](https://issues.apache.org/jira/browse/BEAM-1819)). 
+- Fixed numpy operators in ApproximateQuantiles (Python) ([BEAM-9579](https://issues.apache.org/jira/browse/BEAM-9579)). +- Fixed exception when running in IPython notebook (Python) ([BEAM-9277](https://issues.apache.org/jira/browse/BEAM-9277)). +- Fixed Flink uberjar job termination bug. ([BEAM-9225](https://issues.apache.org/jira/browse/BEAM-9225)) +- Fixed SyntaxError in process worker startup ([BEAM-9503](https://issues.apache.org/jira/browse/BEAM-9503)) +- Key should be available in @OnTimer methods (Java) ([BEAM-1819](https://issues.apache.org/jira/browse/BEAM-1819)). ## Known Issues -* ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). -* ([BEAM-9322](https://issues.apache.org/jira/browse/BEAM-9322)). -* Python SDK `pre_optimize=all` experiment may cause error ([BEAM-9445](https://issues.apache.org/jira/browse/BEAM-9445)) +- ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)). +- ([BEAM-9322](https://issues.apache.org/jira/browse/BEAM-9322)). +- Python SDK `pre_optimize=all` experiment may cause error ([BEAM-9445](https://issues.apache.org/jira/browse/BEAM-9445)) # [2.19.0] - 2020-01-31 diff --git a/buildSrc/build.gradle.kts b/buildSrc/build.gradle.kts index 60d037b61ace..5ba809615c48 100644 --- a/buildSrc/build.gradle.kts +++ b/buildSrc/build.gradle.kts @@ -25,12 +25,25 @@ plugins { // Define the set of repositories required to fetch and enable plugins.
repositories { - jcenter() + mavenCentral() + mavenLocal() maven { url = uri("https://plugins.gradle.org/m2/") } maven { - url = uri("https://repo.spring.io/plugins-release/") + url = uri("https://repo.spring.io/plugins-snapshot/") content { includeGroup("io.spring.gradle") } } + maven { + url = uri("https://${System.getenv("ARTIFACTORY_URL")}/maven-anthology") + credentials { + username = System.getenv("ARTIFACTORY_USERNAME") + password = System.getenv("ARTIFACTORY_PASSWORD") + } + isAllowInsecureProtocol = false + } + maven { + url = uri("https://jitpack.io") + content { includeGroup("com.github.davidmc24.gradle-avro-plugin") } + } } // Dependencies on other plugins used when this plugin is invoked @@ -42,8 +55,8 @@ dependencies { runtime("net.ltgt.gradle:gradle-apt-plugin:0.21") // Enable a Java annotation processor runtime("com.google.protobuf:protobuf-gradle-plugin:0.8.13") // Enable proto code generation - runtime("io.spring.gradle:propdeps-plugin:0.0.9.RELEASE") // Enable provided and optional configurations - runtime("com.commercehub.gradle.plugin:gradle-avro-plugin:0.11.0") // Enable Avro code generation + runtime("io.spring.gradle:propdeps-plugin:0.0.9.BUILD-SNAPSHOT") // Enable provided and optional configurations + runtime("com.commercehub.gradle.plugin:gradle-avro-plugin:0.16.0") // Enable Avro code generation runtime("com.diffplug.spotless:spotless-plugin-gradle:5.6.1") // Enable a code formatting plugin runtime("gradle.plugin.com.github.blindpirate:gogradle:0.11.4") // Enable Go code compilation runtime("gradle.plugin.com.palantir.gradle.docker:gradle-docker:0.22.0") // Enable building Docker containers @@ -53,8 +66,8 @@ dependencies { runtime("net.linguica.gradle:maven-settings-plugin:0.5") runtime("gradle.plugin.io.pry.gradle.offline_dependencies:gradle-offline-dependencies-plugin:0.5.0") // Enable creating an offline repository runtime("net.ltgt.gradle:gradle-errorprone-plugin:1.2.1") // Enable errorprone Java static analysis - 
runtime("org.ajoberstar.grgit:grgit-gradle:4.0.2") // Enable website git publish to asf-site branch - runtime("com.avast.gradle:gradle-docker-compose-plugin:0.13.2") // Enable docker compose tasks + runtime("org.ajoberstar.grgit:grgit-gradle:4.1.1") // Enable website git publish to asf-site branch + runtime("com.avast.gradle:gradle-docker-compose-plugin:0.16.2") // Enable docker compose tasks runtime("ca.cutterslade.gradle:gradle-dependency-analyze:1.4.3") // Enable dep analysis runtime("gradle.plugin.net.ossindex:ossindex-gradle-plugin:0.4.11") // Enable dep vulnerability analysis runtime("org.checkerframework:checkerframework-gradle-plugin:0.5.16") // Enable enhanced static checking plugin diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index b2a2d4bcd508..210dcb01a353 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -105,7 +105,7 @@ class BeamModulePlugin implements Plugin { Map classesTriggerCheckerBugs = [:] /** Controls whether the dependency analysis plugin is enabled. */ - boolean enableStrictDependencies = true + boolean enableStrictDependencies = false /** Override the default "beam-" + `dash separated path` archivesBaseName. */ String archivesBaseName = null @@ -372,7 +372,7 @@ class BeamModulePlugin implements Plugin { // Automatically use the official release version if we are performing a release // otherwise append '-SNAPSHOT' - project.version = '2.33.0' + project.version = '2.33.4' if (!isRelease(project)) { project.version += '-SNAPSHOT' } @@ -699,28 +699,41 @@ class BeamModulePlugin implements Plugin { name "testPublicationLocal" url "file://${project.rootProject.projectDir}/testPublication/" } - maven { - url(project.properties['distMgmtSnapshotsUrl'] ?: isRelease(project) - ? 
'https://repository.apache.org/service/local/staging/deploy/maven2' - : 'https://repository.apache.org/content/repositories/snapshots') - name(project.properties['distMgmtServerId'] ?: isRelease(project) - ? 'apache.releases.https' : 'apache.snapshots.https') - // The maven settings plugin will load credentials from ~/.m2/settings.xml file that a user - // has configured with the Apache release and snapshot staging credentials. - // - // - // - // apache.releases.https - // USER_TOKEN - // PASS_TOKEN - // - // - // apache.snapshots.https - // USER_TOKEN - // PASS_TOKEN - // - // - // + // Use Artifactory if configured, otherwise use Apache repositories + if (System.env.ARTIFACTORY_URL) { + maven { + name "artifactory" + url = "https://${System.env.ARTIFACTORY_URL}/maven-anthology" + credentials { + username = System.env.ARTIFACTORY_USERNAME + password = System.env.ARTIFACTORY_PASSWORD + } + allowInsecureProtocol = false + } + } else { + maven { + url(project.properties['distMgmtSnapshotsUrl'] ?: isRelease(project) + ? 'https://repository.apache.org/service/local/staging/deploy/maven2' + : 'https://repository.apache.org/content/repositories/snapshots') + name(project.properties['distMgmtServerId'] ?: isRelease(project) + ? 'apache.releases.https' : 'apache.snapshots.https') + // The maven settings plugin will load credentials from ~/.m2/settings.xml file that a user + // has configured with the Apache release and snapshot staging credentials. 
+ // + // + // + // apache.releases.https + // USER_TOKEN + // PASS_TOKEN + // + // + // apache.snapshots.https + // USER_TOKEN + // PASS_TOKEN + // + // + // + } } } @@ -1747,7 +1760,13 @@ class BeamModulePlugin implements Plugin { project.apply plugin: 'base' project.apply plugin: "com.github.blindpirate.gogradle" - project.golang { goVersion = '1.16.5' } + // Use existing Go installation to avoid deprecated download URL + def goRootPath = System.getenv('GOROOT') ?: new File(System.getProperty('user.home'), '.gradle/go/1.16.5').absolutePath + if (new File(goRootPath).exists()) { + project.golang { goRoot = goRootPath } + } else { + project.golang { goVersion = '1.16.5' } + } project.repositories { golang { @@ -2502,4 +2521,4 @@ class BeamModulePlugin implements Plugin { } } } -} +} \ No newline at end of file diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/Repositories.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/Repositories.groovy index 89b68a56025d..f3be83284b68 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/Repositories.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/Repositories.groovy @@ -38,7 +38,6 @@ class Repositories { mavenCentral() mavenLocal() - jcenter() // For pentaho dependencies. 
maven { @@ -61,16 +60,20 @@ class Repositories { content { includeGroup "io.confluent" } } - // For textiq maven { - url "http://maven-aws.textiq.net:8080/repository/internal" - content { allowInsecureProtocol = true } + name "artifactory" + url = "https://${System.env.ARTIFACTORY_URL}/maven-anthology" + credentials { + username = System.env.ARTIFACTORY_USERNAME + password = System.env.ARTIFACTORY_PASSWORD + } + allowInsecureProtocol = false } } // plugin to support repository authentication via ~/.m2/settings.xml // https://github.com/mark-vieira/gradle-maven-settings-plugin/ - project.apply plugin: 'net.linguica.maven-settings' + // project.apply plugin: 'net.linguica.maven-settings' // Disabled due to missing settings-security.xml // Apply a plugin which provides the 'updateOfflineRepository' task that creates an offline // repository. This offline repository satisfies all Gradle build dependencies and Java @@ -84,9 +87,8 @@ class Repositories { repositories { mavenLocal() mavenCentral() - jcenter() maven { url "https://plugins.gradle.org/m2/" } - maven { url "https://repo.spring.io/plugins-release" } + maven { url "https://repo.spring.io/plugins-snapshot" } maven { url "https://public.nexus.pentaho.org/repository/proxy-public-3rd-party-release" } maven { url "https://packages.confluent.io/maven/" } maven { url project.offlineRepositoryRoot } diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy index 1d17560acabd..1d3793cd84a1 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy @@ -187,28 +187,41 @@ artifactId=${project.name} name "testPublicationLocal" url "file://${project.rootProject.projectDir}/testPublication/" } - maven { - url(project.properties['distMgmtSnapshotsUrl'] ?: isRelease(project) - ? 
'https://repository.apache.org/service/local/staging/deploy/maven2' - : 'https://repository.apache.org/content/repositories/snapshots') - name(project.properties['distMgmtServerId'] ?: isRelease(project) - ? 'apache.releases.https' : 'apache.snapshots.https') - // The maven settings plugin will load credentials from ~/.m2/settings.xml file that a user - // has configured with the Apache release and snapshot staging credentials. - // - // - // - // apache.releases.https - // USER_TOKEN - // PASS_TOKEN - // - // - // apache.snapshots.https - // USER_TOKEN - // PASS_TOKEN - // - // - // + // Use Artifactory if configured, otherwise use Apache repositories + if (System.env.ARTIFACTORY_URL) { + maven { + name "artifactory" + url = "https://${System.env.ARTIFACTORY_URL}/maven-anthology" + credentials { + username = System.env.ARTIFACTORY_USERNAME + password = System.env.ARTIFACTORY_PASSWORD + } + allowInsecureProtocol = false + } + } else { + maven { + url(project.properties['distMgmtSnapshotsUrl'] ?: isRelease(project) + ? 'https://repository.apache.org/service/local/staging/deploy/maven2' + : 'https://repository.apache.org/content/repositories/snapshots') + name(project.properties['distMgmtServerId'] ?: isRelease(project) + ? 'apache.releases.https' : 'apache.snapshots.https') + // The maven settings plugin will load credentials from ~/.m2/settings.xml file that a user + // has configured with the Apache release and snapshot staging credentials. 
+ // + // + // + // apache.releases.https + // USER_TOKEN + // PASS_TOKEN + // + // + // apache.snapshots.https + // USER_TOKEN + // PASS_TOKEN + // + // + // + } } } diff --git a/gradle.properties b/gradle.properties index f8699d435767..dc40409d3138 100644 --- a/gradle.properties +++ b/gradle.properties @@ -24,8 +24,8 @@ offlineRepositoryRoot=offline-repository signing.gnupg.executable=gpg signing.gnupg.useLegacyGpg=true -version=2.33.0-textiq -sdk_version=2.33.0.dev-textiq +version=2.33.4-textiq +sdk_version=2.33.4.dev-textiq javaVersion=1.8 diff --git a/runners/google-cloud-dataflow-java/build.gradle b/runners/google-cloud-dataflow-java/build.gradle index 969b2b306eb3..3627a8b9b160 100644 --- a/runners/google-cloud-dataflow-java/build.gradle +++ b/runners/google-cloud-dataflow-java/build.gradle @@ -45,8 +45,8 @@ processResources { filter org.apache.tools.ant.filters.ReplaceTokens, tokens: [ 'dataflow.legacy_environment_major_version' : '8', 'dataflow.fnapi_environment_major_version' : '8', - 'dataflow.legacy_container_version' : 'beam-2.33.0', - 'dataflow.fnapi_container_version' : 'beam-2.33.0', + 'dataflow.legacy_container_version' : 'beam-2.33.4', + 'dataflow.fnapi_container_version' : 'beam-2.33.4', 'dataflow.container_base_repository' : 'gcr.io/cloud-dataflow/v1beta3', ] } diff --git a/sdks/go/pkg/beam/core/core.go b/sdks/go/pkg/beam/core/core.go index 34fe071733b7..dd1c8448abeb 100644 --- a/sdks/go/pkg/beam/core/core.go +++ b/sdks/go/pkg/beam/core/core.go @@ -27,5 +27,5 @@ const ( // SdkName is the human readable name of the SDK for UserAgents. SdkName = "Apache Beam SDK for Go" // SdkVersion is the current version of the SDK. 
- SdkVersion = "2.33.0.dev" + SdkVersion = "2.33.4.dev" ) diff --git a/sdks/java/core/build.gradle b/sdks/java/core/build.gradle index d884ec04f465..0bfbcaa65f46 100644 --- a/sdks/java/core/build.gradle +++ b/sdks/java/core/build.gradle @@ -100,6 +100,7 @@ dependencies { shadowTest library.java.mockito_core shadowTest library.java.hamcrest_core shadowTest library.java.hamcrest_library + shadowTest library.java.hamcrest shadowTest "com.esotericsoftware.kryo:kryo:2.21" shadowTest library.java.quickcheck_core shadowTest library.java.avro_tests diff --git a/sdks/java/extensions/sql/jdbc/build.gradle b/sdks/java/extensions/sql/jdbc/build.gradle index a1c99cf53c1b..60ac89ec8dc2 100644 --- a/sdks/java/extensions/sql/jdbc/build.gradle +++ b/sdks/java/extensions/sql/jdbc/build.gradle @@ -72,7 +72,7 @@ task shadowJarTest(type: Test, dependsOn: ":sdks:java:extensions:sql:jdbc:shadow outputs.upToDateWhen { false } systemProperty "driver.jar", configurations.shadowTestRuntimeClasspath[1] - + include '**/JdbcJarTest.class' classpath = configurations.integrationTest useJUnit { } diff --git a/sdks/java/io/azure/build.gradle b/sdks/java/io/azure/build.gradle index 18596e4bd6b0..b628852e9760 100644 --- a/sdks/java/io/azure/build.gradle +++ b/sdks/java/io/azure/build.gradle @@ -26,7 +26,7 @@ applyJavaNature( description = "Apache Beam :: SDKs :: Java :: IO :: Azure" ext.summary = "IO library to read and write Azure services from Beam." 
-repositories { jcenter() } +repositories { mavenCentral() } dependencies { compile library.java.commons_io diff --git a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java index 7b1945c99f1b..93ca9c87e5d8 100644 --- a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java +++ b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java @@ -426,7 +426,7 @@ private long getEstimatedSizeBytes( try { Document stats = mongoDatabase.runCommand(stat); return stats.get("size", Number.class).longValue(); - } catch (MongoCommandException e) { + } catch (MongoCommandException e) { // collection does not exist, size = 0 return 0; } @@ -478,7 +478,7 @@ public List> split( try { Document splitVectorCommandResult = mongoDatabase.runCommand(splitVectorCommand); splitKeys = (List) splitVectorCommandResult.get("splitKeys"); - }catch (MongoCommandException e) { + } catch (MongoCommandException e) { // collection does not exist: no split keys to work with splitKeys = Collections.EMPTY_LIST; } @@ -529,7 +529,7 @@ public List> split( } private static String getFilterString(BsonType type, String value) { - switch(type) { + switch (type) { case STRING: return "\"" + value + "\""; case OBJECT_ID: @@ -578,14 +578,14 @@ static List splitKeysToFilters(List splitKeys, BsonType idType if (i == 0) { // this is the first split in the list, the filter defines // the range from the beginning up to this split - rangeFilter = String.format("{ $and: [ {\"_id\":{$lte:%s}}", - getFilterString(idType, splitKey)); + rangeFilter = + String.format("{ $and: [ {\"_id\":{$lte:%s}}", getFilterString(idType, splitKey)); filters.add(String.format("%s ]}", rangeFilter)); // If there is only one split, also generate a range from the split to the end if (splitKeys.size() == 1) { - rangeFilter = String.format("{ $and: [ {\"_id\":{$gt:%s}}", - 
getFilterString(idType,splitKey)); + rangeFilter = + String.format("{ $and: [ {\"_id\":{$gt:%s}}", getFilterString(idType, splitKey)); filters.add(String.format("%s ]}", rangeFilter)); } } else if (i == splitKeys.size() - 1) { @@ -595,19 +595,17 @@ static List splitKeysToFilters(List splitKeys, BsonType idType rangeFilter = String.format( "{ $and: [ {\"_id\":{$gt:%s,$lte:%s}}", - getFilterString(idType,lowestBound), - getFilterString(idType, splitKey)); + getFilterString(idType, lowestBound), getFilterString(idType, splitKey)); filters.add(String.format("%s ]}", rangeFilter)); - rangeFilter = String.format("{ $and: [ {\"_id\":{$gt:%s}}", - getFilterString(idType, splitKey)); + rangeFilter = + String.format("{ $and: [ {\"_id\":{$gt:%s}}", getFilterString(idType, splitKey)); filters.add(String.format("%s ]}", rangeFilter)); } else { // we are between two splits rangeFilter = String.format( "{ $and: [ {\"_id\":{$gt:%s," + "$lte:%s}}", - getFilterString(idType, lowestBound), - getFilterString(idType, splitKey)); + getFilterString(idType, lowestBound), getFilterString(idType, splitKey)); filters.add(String.format("%s ]}", rangeFilter)); } diff --git a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java index 8e0b49436cb0..811a7cf2c418 100644 --- a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java +++ b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java @@ -125,7 +125,8 @@ public void testSplitIntoFilters() { // A single split will result in two filters ArrayList documents = new ArrayList<>(); documents.add(new Document("_id", 56)); - List filters = MongoDbIO.BoundedMongoDbSource.splitKeysToFilters(documents, BsonType.OBJECT_ID); + List filters = + MongoDbIO.BoundedMongoDbSource.splitKeysToFilters(documents, BsonType.OBJECT_ID); assertEquals(2, filters.size()); assertEquals("{ $and: [ 
{\"_id\":{$lte:ObjectId(\"56\")}} ]}", filters.get(0)); assertEquals("{ $and: [ {\"_id\":{$gt:ObjectId(\"56\")}} ]}", filters.get(1)); @@ -268,30 +269,34 @@ public void testReadWithFilter() { @Test public void testReadWithProjection() { PCollection output = - pipeline.apply( - MongoDbIO.read() - .withUri("mongodb://localhost:" + port) - .withDatabase(DATABASE) - .withCollection(COLLECTION) - .withQueryFn(FindQuery.create().withProjection(Lists.newArrayList("scientist")))); + pipeline.apply( + MongoDbIO.read() + .withUri("mongodb://localhost:" + port) + .withDatabase(DATABASE) + .withCollection(COLLECTION) + .withQueryFn(FindQuery.create().withProjection(Lists.newArrayList("scientist")))); PAssert.thatSingleton(output.apply("Count All", Count.globally())).isEqualTo(1000L); PAssert.thatSingleton( - output - .apply("Filter docs with country field present", - Filter.by((SerializableFunction) - input -> input.getString("country") != null)) + output + .apply( + "Filter docs with country field present", + Filter.by( + (SerializableFunction) + input -> input.getString("country") != null)) .apply("CountWithCountry", Count.globally())) - .isEqualTo(0L); + .isEqualTo(0L); PAssert.thatSingleton( - output - .apply("Filter docs with scientist field present", - Filter.by((SerializableFunction) - input -> input.getString("scientist") != null)) + output + .apply( + "Filter docs with scientist field present", + Filter.by( + (SerializableFunction) + input -> input.getString("scientist") != null)) .apply("CountWithScientist", Count.globally())) - .isEqualTo(1000L); + .isEqualTo(1000L); pipeline.run(); } diff --git a/sdks/python/apache_beam/version.py b/sdks/python/apache_beam/version.py index 27d47f22fb24..a90533ae00b6 100644 --- a/sdks/python/apache_beam/version.py +++ b/sdks/python/apache_beam/version.py @@ -17,4 +17,4 @@ """Apache Beam SDK version information and utilities.""" -__version__ = '2.33.0.dev' +__version__ = '2.33.4.dev' diff --git a/vendor/grpc-1_36_0/build.gradle 
b/vendor/grpc-1_36_0/build.gradle index b4971fde7d67..0a857f3b1954 100644 --- a/vendor/grpc-1_36_0/build.gradle +++ b/vendor/grpc-1_36_0/build.gradle @@ -16,9 +16,21 @@ * limitations under the License. */ +plugins { id 'org.apache.beam.vendor-java' } + import org.apache.beam.gradle.GrpcVendoring_1_36_0 -plugins { id 'org.apache.beam.vendor-java' } +repositories { + mavenLocal() + maven { + url = uri("https://${System.env.ARTIFACTORY_URL}/maven-anthology") + credentials { + username = System.env.ARTIFACTORY_USERNAME + password = System.env.ARTIFACTORY_PASSWORD + } + allowInsecureProtocol = false + } +} description = "Apache Beam :: Vendored Dependencies :: gRPC :: 1.36.0"