In [ ]:
Copied!
import marimo as mo
import marimo as mo
mloda demo: How can we make feature engineering shareable?¶
Define dummy data as plugin¶
In [ ]:
Copied!
import numpy as np
from mloda.provider import FeatureGroup, DataCreator
class DummyData(FeatureGroup):
    """Feature group that fabricates a small random demo dataset.

    It produces the columns ``age``, ``weight``, ``state`` and ``gender``
    from NumPy random draws.
    """

    @classmethod
    def calculate_feature(cls, data, features):
        """Return a dict mapping each column name to randomly drawn values.

        ``data`` is not used. The row count is read from the ``n_samples``
        option on ``features``; when that lookup yields a falsy value
        (e.g. ``None``), 100 rows are generated.
        """
        requested = features.get_options_key("n_samples")
        n_samples = requested if requested else 100

        # The draws happen in the order age, weight, state, gender.
        ages = np.random.randint(18, 80, n_samples)  # integers in [18, 80)
        weights = np.random.normal(70, 15, n_samples)  # mean 70, std dev 15
        states = np.random.choice(["CA", "NY", "TX", "FL"], n_samples)
        genders = np.random.choice(["M", "F"], n_samples)

        return {"age": ages, "weight": weights, "state": states, "gender": genders}

    @classmethod
    def input_data(cls):
        """Return a ``DataCreator`` holding the four column names.

        NOTE(review): presumably this is how mloda learns that this group
        generates these columns itself - confirm against the mloda docs.
        """
        return DataCreator({"age", "weight", "state", "gender"})
import numpy as np
from mloda.provider import FeatureGroup, DataCreator
class DummyData(FeatureGroup):
    """Feature group that fabricates a small random demo dataset.

    It produces the columns ``age``, ``weight``, ``state`` and ``gender``
    from NumPy random draws.
    """

    @classmethod
    def calculate_feature(cls, data, features):
        """Return a dict mapping each column name to randomly drawn values.

        ``data`` is not used. The row count is read from the ``n_samples``
        option on ``features``; when that lookup yields a falsy value
        (e.g. ``None``), 100 rows are generated.
        """
        requested = features.get_options_key("n_samples")
        n_samples = requested if requested else 100

        # The draws happen in the order age, weight, state, gender.
        ages = np.random.randint(18, 80, n_samples)  # integers in [18, 80)
        weights = np.random.normal(70, 15, n_samples)  # mean 70, std dev 15
        states = np.random.choice(["CA", "NY", "TX", "FL"], n_samples)
        genders = np.random.choice(["M", "F"], n_samples)

        return {"age": ages, "weight": weights, "state": states, "gender": genders}

    @classmethod
    def input_data(cls):
        """Return a ``DataCreator`` holding the four column names.

        NOTE(review): presumably this is how mloda learns that this group
        generates these columns itself - confirm against the mloda docs.
        """
        return DataCreator({"age", "weight", "state", "gender"})
Request mlodaAPI to create features¶
In [ ]:
Copied!
# We load dependencies.
from mloda.user import mloda, PluginLoader
# Load plugins into the namespace so the compute frameworks register.
PluginLoader.all()

# Request the four columns by name (the same names DummyData declares) and
# offer two compute frameworks to the run.
_result = mloda.run_all(
    ["age", "weight", "state", "gender"],
    compute_frameworks=["PyArrowTable", "PandasDataFrame"],
)
print(_result)
# We load dependencies.
from mloda.user import mloda, PluginLoader
# Load plugins into the namespace so the compute frameworks register.
PluginLoader.all()

# Request the four columns by name (the same names DummyData declares) and
# offer two compute frameworks to the run.
_result = mloda.run_all(
    ["age", "weight", "state", "gender"],
    compute_frameworks=["PyArrowTable", "PandasDataFrame"],
)
print(_result)
[pyarrow.Table age: int64 state: string weight: double gender: string ---- age: [[31,57,38,67,47,...,41,58,73,74,25]] state: [["FL","CA","NY","TX","TX",...,"CA","CA","NY","NY","FL"]] weight: [[53.07717743546068,47.84488437429319,68.9278583048704,97.40943188374412,80.39857279514968,...,75.42237139226059,62.266587953254444,64.40075366428398,75.37274314011698,98.47289476170835]] gender: [["M","F","M","F","F",...,"M","F","M","F","F"]]]
Chain features - automatic dependency resolution¶
In [ ]:
Copied!
# Request a chained feature purely by name: "age__sum_aggr" asks for a sum
# aggregation on top of "age" (see the printed result and the explicit
# configuration shown later in this notebook).
_result = mloda.run_all(
    ["age__sum_aggr"],
    compute_frameworks=["PolarsLazyDataFrame"],
)
print(_result)
# Request a chained feature purely by name: "age__sum_aggr" asks for a sum
# aggregation on top of "age" (see the printed result and the explicit
# configuration shown later in this notebook).
_result = mloda.run_all(
    ["age__sum_aggr"],
    compute_frameworks=["PolarsLazyDataFrame"],
)
print(_result)
[shape: (100, 1) ┌───────────────┐ │ age__sum_aggr │ │ --- │ │ i64 │ ╞═══════════════╡ │ 4763 │ │ 4763 │ │ 4763 │ │ 4763 │ │ 4763 │ │ … │ │ 4763 │ │ 4763 │ │ 4763 │ │ 4763 │ │ 4763 │ └───────────────┘]
As long as the plugins exist, we can run any data transformation.
What is behind the "age__sum_aggr" syntax?¶
In [ ]:
Copied!
from mloda.user import Feature, Options
# Spell out the same kind of request explicitly instead of encoding it in the
# feature name: a sum aggregation whose input is "age", with the input
# feature carrying its own option (n_samples=5).
feature = Feature(
    name="CustomConfiguration",
    options=Options(
        context={
            "aggregation_type": "sum",
            "in_features": Feature("age", options={"n_samples": 5}),
        }
    ),
)
_result = mloda.run_all(
    [feature],
    compute_frameworks=["PolarsLazyDataFrame"],
)
print(_result)
from mloda.user import Feature, Options
# Spell out the same kind of request explicitly instead of encoding it in the
# feature name: a sum aggregation whose input is "age", with the input
# feature carrying its own option (n_samples=5).
feature = Feature(
    name="CustomConfiguration",
    options=Options(
        context={
            "aggregation_type": "sum",
            "in_features": Feature("age", options={"n_samples": 5}),
        }
    ),
)
_result = mloda.run_all(
    [feature],
    compute_frameworks=["PolarsLazyDataFrame"],
)
print(_result)
[shape: (5, 1) ┌─────────────────────┐ │ CustomConfiguration │ │ --- │ │ i64 │ ╞═════════════════════╡ │ 266 │ │ 266 │ │ 266 │ │ 266 │ │ 266 │ └─────────────────────┘]
How the chaining essentially works¶
class FeatureGroup(ABC):
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
# In principle, the resolver checks if the feature group depends on another input feature
# -> then adds it to the chain of features which need to be resolved
if feature_name contains "input_feature__sum_aggr":
return input_feature
# How does mloda know that a feature matches a feature group?
# Customizable, but some good guesses
@classmethod
def match_feature_group_criteria(
cls,
feature_name: Union[FeatureName, str],
options: Options,
data_access_collection: Optional[DataAccessCollection] = None,
) -> bool:
Now we have chaining and matching. Why do we do this?¶
class FeatureGroup(ABC):
    # Illustrative excerpt of the plugin interface discussed in this notebook;
    # only the method relevant to the explanation is shown.

    @classmethod
    def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
        """
        This function should be used to calculate the feature.
        """
        # data is the incoming data from other feature dependencies or data via mloda
        # features is the configuration
Business knowledge is in the data and in the configuration, but not in the plugin definition.¶
Big idea¶
Separate business logic from transformation logic:
- Plugins = generic transformations (shareable across companies)
- Data + Config = your business knowledge (stays private)
→ Stop rewriting "sum of a column" at every company
→ Build a shared ecosystem of feature engineering plugins