Source code for pyspark_pipeline_framework.runtime.dataflow.schema

"""Schema-aware DataFlow for components with input/output contracts."""

from __future__ import annotations

from typing import Any

from pyspark_pipeline_framework.runtime.dataflow.base import DataFlow


class SchemaAwareDataFlow(DataFlow):
    """A ``DataFlow`` that can advertise the schemas it consumes and emits.

    The two properties defined here give the class the shape required by
    the ``SchemaContract`` protocol in ``core.component.protocols``.
    Schemas are typed as ``Any`` for now; that is a placeholder until
    ``core.schema.SchemaDefinition`` is implemented (ppf-1si).

    Both properties default to ``None``, meaning "no contract declared".
    A subclass overrides ``input_schema``, ``output_schema``, or both to
    declare its contract:

    >>> class MyTransform(SchemaAwareDataFlow):
    ...     @property
    ...     def name(self) -> str:
    ...         return "typed-transform"
    ...
    ...     @property
    ...     def input_schema(self) -> Any:
    ...         return {"col_a": "string", "col_b": "int"}
    ...
    ...     def run(self) -> None:
    ...         ...
    """

    @property
    def input_schema(self) -> Any | None:
        """Return the expected input schema; ``None`` means none is declared."""
        # Base implementation declares no input contract.
        return None

    @property
    def output_schema(self) -> Any | None:
        """Return the produced output schema; ``None`` means none is declared."""
        # Base implementation declares no output contract.
        return None