Skip to content

Feature Imputation

Feature Imputation imputes missing features on dense instances using user-specified method(s) and value(s). Imputation can be done on select columns using arbitrary methods.

Param

feature_imputation_param

Classes

FeatureImputationParam(default_value=0, missing_fill_method=None, col_missing_fill_method=None, missing_impute=None, need_run=True)

Bases: BaseParam

Define feature imputation parameters

Parameters:

Name Type Description Default
default_value None or single object type or list

the value used to replace missing values. If None, the default value defined in federatedml/feature/imputer.py is used; if a single object, missing values are filled with this object; if a list, its length should equal the input data's feature dimension, and a missing value in a given column is replaced by the element at the matching position in this list.

0
missing_fill_method [None, min, max, mean, designated]

the method to replace missing value

None
col_missing_fill_method

specifies method to replace missing value for each column; any column not specified will take missing_fill_method, if missing_fill_method is None, unspecified column will not be imputed;

None
missing_impute None or list

elements of the list may be of any type, or auto-generated if value is None; defines which values are considered missing, default: None

None
need_run

need run or not

True
Source code in python/federatedml/param/feature_imputation_param.py
48
49
50
51
52
53
54
55
def __init__(self, default_value=0, missing_fill_method=None, col_missing_fill_method=None,
             missing_impute=None, need_run=True):
    """Store feature imputation parameters; validation is deferred to check()."""
    super(FeatureImputationParam, self).__init__()
    # attribute names mirror the constructor arguments one-to-one
    for attr, value in (("default_value", default_value),
                        ("missing_fill_method", missing_fill_method),
                        ("col_missing_fill_method", col_missing_fill_method),
                        ("missing_impute", missing_impute),
                        ("need_run", need_run)):
        setattr(self, attr, value)
Attributes
default_value = default_value instance-attribute
missing_fill_method = missing_fill_method instance-attribute
col_missing_fill_method = col_missing_fill_method instance-attribute
missing_impute = missing_impute instance-attribute
need_run = need_run instance-attribute
Functions
check()
Source code in python/federatedml/param/feature_imputation_param.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def check(self):
    """Validate feature imputation parameters.

    Method names are validated against the supported set
    {'min', 'max', 'mean', 'designated'} and normalized to lower case
    in place (both ``missing_fill_method`` and the per-column values of
    ``col_missing_fill_method``).

    Returns
    -------
    bool
        True if all parameters are valid.

    Raises
    ------
    ValueError
        If any parameter has an unsupported type or value.
    """
    descr = "feature imputation param's "

    self.check_boolean(self.need_run, descr + "need_run")

    if self.missing_fill_method is not None:
        self.missing_fill_method = self.check_and_change_lower(self.missing_fill_method,
                                                               ['min', 'max', 'mean', 'designated'],
                                                               f"{descr}missing_fill_method ")
    # use `is not None` (not truthiness) so falsy-but-invalid values such as
    # "" or 0 cannot silently bypass validation
    if self.col_missing_fill_method is not None:
        if not isinstance(self.col_missing_fill_method, dict):
            raise ValueError(f"{descr}col_missing_fill_method should be a dict")
        for col, method in self.col_missing_fill_method.items():
            if not isinstance(col, str):
                raise ValueError(f"{descr}col_missing_fill_method should contain str key(s) only")
            # normalize the per-column method name in place after validation;
            # assigning to an existing key during items() iteration is safe
            self.col_missing_fill_method[col] = self.check_and_change_lower(
                method,
                ['min', 'max', 'mean', 'designated'],
                f"per column method specified in {descr}col_missing_fill_method dict")
    if self.missing_impute is not None:
        if not isinstance(self.missing_impute, list):
            raise ValueError(f"{descr}missing_impute must be None or list.")

    return True

Examples

Example
## Feature Imputation Pipeline Examples Usage Guide.

This section introduces the pipeline scripts of feature imputation tasks.

#### Pipeline Tasks

1. Feature Imputation using Designated Replace Value for All Columns:
    script: pipeline-feature-imputation-designated.py

2. Feature Imputation using the Same Method for All Columns:
    script: pipeline-feature-imputation-method.py

3. Feature Imputation using Different Methods for Different Columns:
    script: pipeline-feature-imputation-column-method.py

Users can use the following command to run a task.

    python ${pipeline_script}
pipeline-feature-imputation-designated.py
import argparse
from pipeline.utils.tools import load_job_config
from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import FeatureImputation
from pipeline.component import Intersection
from pipeline.component import Reader
from pipeline.interface import Data


def main(config="../../config.yaml", namespace=""):
    """Run a feature imputation job that fills missing values with a designated constant.

    Builds a train pipeline (reader -> data transform -> intersection ->
    feature imputation), fits it, then reuses the fitted components in a
    predict pipeline.

    Parameters
    ----------
    config : str or loaded job config
        path to the job config yaml, or an already-loaded config object
    namespace : str
        suffix appended to the data namespace (used to isolate test runs)
    """
    # obtain config
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    host = parties.host[0]

    guest_train_data = {"name": "breast_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "breast_hetero_host", "namespace": f"experiment{namespace}"}

    pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host)

    # each party reads its own table
    reader_0 = Reader(name="reader_0")
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    reader_0.get_party_instance(role='host', party_id=host).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", with_label=False)

    intersection_0 = Intersection(name="intersection_0")
    # values in missing_impute ([0]) are treated as missing and replaced by the
    # designated default_value (42) on all columns
    feature_imputation_0 = FeatureImputation(name="feature_imputation_0",
                                             missing_fill_method="designated",
                                             default_value=42, missing_impute=[0])

    # add_component order defines the job DAG: each component consumes the
    # previous component's output data
    pipeline.add_component(reader_0)
    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))
    pipeline.add_component(feature_imputation_0, data=Data(data=intersection_0.output.data))
    pipeline.compile()

    pipeline.fit()

    # predict
    # deploy required components
    pipeline.deploy_component([data_transform_0, intersection_0,
                               feature_imputation_0])

    predict_pipeline = PipeLine()
    # add data reader onto predict pipeline
    predict_pipeline.add_component(reader_0)
    # add selected components from train pipeline onto predict pipeline
    # specify data source
    predict_pipeline.add_component(
        pipeline, data=Data(
            predict_input={
                pipeline.data_transform_0.input.data: reader_0.output.data}))
    # run predict model
    predict_pipeline.predict()


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()
pipeline-feature-imputation-column-method.py
import argparse
from pipeline.utils.tools import load_job_config
from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import FeatureImputation
from pipeline.component import Intersection
from pipeline.component import Reader
from pipeline.interface import Data


def main(config="../../config.yaml", namespace=""):
    """Run a feature imputation job using different fill methods per column.

    Same train/predict pipeline structure as the other examples, but the
    guest party overrides the imputation method column by column via
    col_missing_fill_method.

    Parameters
    ----------
    config : str or loaded job config
        path to the job config yaml, or an already-loaded config object
    namespace : str
        suffix appended to the data namespace (used to isolate test runs)
    """
    # obtain config
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    host = parties.host[0]

    guest_train_data = {"name": "dvisits_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "dvisits_hetero_host", "namespace": f"experiment{namespace}"}

    pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host)

    # each party reads its own table
    reader_0 = Reader(name="reader_0")
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    reader_0.get_party_instance(role='host', party_id=host).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", with_label=False)

    intersection_0 = Intersection(name="intersection_0")
    feature_imputation_0 = FeatureImputation(name="feature_imputation_0",
                                             default_value=42,
                                             missing_impute=[0])
    # guest-only per-column override: "doctorco" uses column min,
    # "hscore" uses the designated default_value (42)
    feature_imputation_0.get_party_instance(role='guest', party_id=guest).component_param(
        col_missing_fill_method={"doctorco": "min",
                                 "hscore": "designated"})

    # add_component order defines the job DAG: each component consumes the
    # previous component's output data
    pipeline.add_component(reader_0)
    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))
    pipeline.add_component(feature_imputation_0, data=Data(data=intersection_0.output.data))
    pipeline.compile()

    pipeline.fit()

    # predict
    # deploy required components
    pipeline.deploy_component([data_transform_0, intersection_0,
                               feature_imputation_0])

    predict_pipeline = PipeLine()
    # add data reader onto predict pipeline
    predict_pipeline.add_component(reader_0)
    # add selected components from train pipeline onto predict pipeline
    # specify data source
    predict_pipeline.add_component(
        pipeline, data=Data(
            predict_input={
                pipeline.data_transform_0.input.data: reader_0.output.data}))
    # run predict model
    predict_pipeline.predict()


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()
feature_imputation_testsuite.json
{
    "data": [
        {
            "file": "examples/data/breast_hetero_host.csv",
            "head": 1,
            "partition": 4,
            "table_name": "breast_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_hetero_guest.csv",
            "head": 1,
            "partition": 4,
            "table_name": "breast_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/dvisits_hetero_host.csv",
            "head": 1,
            "partition": 4,
            "table_name": "dvisits_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/dvisits_hetero_guest.csv",
            "head": 1,
            "partition": 4,
            "table_name": "dvisits_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        }
    ],
    "pipeline_tasks": {
        "designated": {
            "script": "pipeline-feature-imputation-designated.py"
        },
        "method": {
            "script": "pipeline-feature-imputation-method.py"
        },
        "diff-method-per-column": {
            "script": "pipeline-feature-imputation-column-method.py"
        }
    }
}            
init.py

pipeline-feature-imputation-method.py
import argparse
from pipeline.utils.tools import load_job_config
from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import FeatureImputation
from pipeline.component import Intersection
from pipeline.component import Reader
from pipeline.interface import Data, Model


def main(config="../../config.yaml", namespace=""):
    """Run a feature imputation job using the same fill method for all columns.

    Demonstrates model reuse: feature_imputation_1 applies the model fitted
    by feature_imputation_0 to the same intersected data.

    Parameters
    ----------
    config : str or loaded job config
        path to the job config yaml, or an already-loaded config object
    namespace : str
        suffix appended to the data namespace (used to isolate test runs)
    """
    # obtain config
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    host = parties.host[0]

    guest_train_data = {"name": "dvisits_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "dvisits_hetero_host", "namespace": f"experiment{namespace}"}

    pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host)

    # each party reads its own table
    reader_0 = Reader(name="reader_0")
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    reader_0.get_party_instance(role='host', party_id=host).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", with_label=False, output_format="dense")

    intersection_0 = Intersection(name="intersection_0")
    # all columns imputed with the column max; 0 is treated as missing
    feature_imputation_0 = FeatureImputation(name="feature_imputation_0", missing_fill_method="max", missing_impute=[0])
    feature_imputation_1 = FeatureImputation(name="feature_imputation_1")

    # add_component order defines the job DAG; feature_imputation_1 takes both
    # data from intersection_0 and the fitted model from feature_imputation_0
    pipeline.add_component(reader_0)
    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))
    pipeline.add_component(feature_imputation_0, data=Data(data=intersection_0.output.data))
    pipeline.add_component(feature_imputation_1,
                           data=Data(data=intersection_0.output.data),
                           model=Model(model=feature_imputation_0.output.model))
    pipeline.compile()

    pipeline.fit()

    # predict
    # deploy required components
    pipeline.deploy_component([data_transform_0, intersection_0,
                               feature_imputation_0])

    predict_pipeline = PipeLine()
    # add data reader onto predict pipeline
    predict_pipeline.add_component(reader_0)
    # add selected components from train pipeline onto predict pipeline
    # specify data source
    predict_pipeline.add_component(
        pipeline, data=Data(
            predict_input={
                pipeline.data_transform_0.input.data: reader_0.output.data}))
    # run predict model
    predict_pipeline.predict()


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()
## Feature Imputation Configuration Usage Guide.

#### Example Tasks

This section introduces the dsl and conf for different types of tasks.

1. Feature Imputation using Designated Replace Value for All Columns:

    example-data: (1) guest: breast_hetero_guest.csv (2) host: breast_hetero_host.csv  

    dsl: feature_imputation_job_dsl.json

    runtime_config : feature_imputation_designated_conf.json

2. Feature Imputation using the Same Method for All Columns:

    example-data: (1) guest: dvisits_hetero_guest.csv (2) host: dvisits_hetero_host.csv  

    dsl: feature_imputation_job_dsl.json

    runtime_config : feature_imputation_method_conf.json

3. Feature Imputation using Different Methods for Different Columns:

    example-data: (1) guest: dvisits_hetero_guest.csv (2) host: dvisits_hetero_host.csv  

    dsl: feature_imputation_job_dsl.json

    runtime_config : feature_imputation_column_method_conf.json

Users can use the following command to run a task.

    flow job submit -c ${runtime_config} -d ${dsl}

Moreover, after successfully running the training task, you can use it to predict too.
feature_imputation_designated_job_conf.json
{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "guest": [
            9999
        ],
        "host": [
            10000
        ]
    },
    "component_parameters": {
        "role": {
            "host": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_host",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "guest": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_guest",
                            "namespace": "experiment"
                        }
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "with_label": false
            },
            "feature_imputation_0": {
                "missing_fill_method": "designated",
                "default_value": 42,
                "missing_impute": [
                    0
                ]
            }
        }
    }
}            
feature_imputation_column_method_job_conf.json
{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "guest": [
            9999
        ],
        "host": [
            10000
        ]
    },
    "component_parameters": {
        "role": {
            "host": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "dvisits_hetero_host",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "guest": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "dvisits_hetero_guest",
                            "namespace": "experiment"
                        }
                    },
                    "feature_imputation_0": {
                        "col_missing_fill_method": {
                            "doctorco": "min",
                            "hscore": "designated"
                        }
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "with_label": false
            },
            "feature_imputation_0": {
                "missing_impute": [
                    0
                ],
                "default_value": 42
            }
        }
    }
}            
feature_imputation_method_job_dsl.json
{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "intersection_0": {
            "module": "Intersection",
            "input": {
                "data": {
                    "data": [
                        "data_transform_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "feature_imputation_0": {
            "module": "FeatureImputation",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "feature_imputation_1": {
            "module": "FeatureImputation",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                },
                "model": [
                    "feature_imputation_0.model"
                ]
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}            
feature_imputation_job_dsl.json
{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "intersection_0": {
            "module": "Intersection",
            "input": {
                "data": {
                    "data": [
                        "data_transform_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "feature_imputation_0": {
            "module": "FeatureImputation",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}            
feature_imputation_testsuite.json
{
    "data": [
        {
            "file": "examples/data/breast_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_hetero_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/dvisits_hetero_host.csv",
            "head": 1,
            "partition": 4,
            "table_name": "dvisits_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/dvisits_hetero_guest.csv",
            "head": 1,
            "partition": 4,
            "table_name": "dvisits_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        }
    ],
    "tasks": {
        "designated": {
            "conf": "./feature_imputation_designated_job_conf.json",
            "dsl": "./feature_imputation_job_dsl.json"
        },
        "method": {
            "conf": "./feature_imputation_method_job_conf.json",
            "dsl": "./feature_imputation_method_job_dsl.json"
        },
        "diff-method-per-column": {
            "conf": "./feature_imputation_column_method_job_conf.json",
            "dsl": "./feature_imputation_job_dsl.json"
        }
    }
}            
feature_imputation_method_job_conf.json
{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "guest": [
            9999
        ],
        "host": [
            10000
        ]
    },
    "component_parameters": {
        "role": {
            "host": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "dvisits_hetero_host",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "guest": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "dvisits_hetero_guest",
                            "namespace": "experiment"
                        }
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "with_label": false
            },
            "feature_imputation_0": {
                "missing_fill_method": "max",
                "missing_impute": [
                    0
                ]
            }
        }
    }
}            

Last updated: 2021-11-15