跳转至

Population Stability Index (PSI)

Introduction

Population stability index (PSI) is a metric that measures how much the distribution of a feature has shifted between two sample sets. PSI is usually used to measure the stability of models or the quality of features. In FATE, the PSI module computes the PSI values of features between two tables.

Given two data columns, expect and actual, PSI is computed by the following steps:

  • apply quantile feature binning to the expect column and the actual column
  • compute the percentage of each interval (bin), which is given by (bin sample count) / (total sample number)
  • compute the PSI value by summing over all bins $i$: $\mathrm{PSI} = \sum_{i} \left(\mathrm{actual\_percentage}_i - \mathrm{expect\_percentage}_i\right) \cdot \ln\left(\dfrac{\mathrm{actual\_percentage}_i}{\mathrm{expect\_percentage}_i}\right)$

For more details on PSI, refer to the external reference linked from the original FATE documentation (the link target is missing from this copy).

Param

  • max_bin_num: int, max bin number of quantile feature binning
  • need_run: bool, whether to run this module in the DSL
  • dense_missing_val: int, float, or string; the imputed missing value when the input format is dense. The default is np.nan, and keeping the default setting is recommended

Examples

Example
## PSI Example Usage Guide.

#### Example Tasks

This section introduces the Pipeline scripts for different types of tasks.

1. compute PSI values between expect/actual table:

    script: pipeline-psi.py

Users can run a pipeline job directly:

    python ${pipeline_script}
__init__.py

pipeline-psi.py
import argparse

from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import PSI
from pipeline.component import Reader
from pipeline.interface import Data
from pipeline.interface import Model

from pipeline.utils.tools import load_job_config


def main(config="../../config.yaml", namespace=""):
    """Build and run a pipeline that computes PSI between the "expect" and "actual" tables.

    The pipeline consists of two readers, two data transforms and one PSI
    component. ``reader_0`` loads the "expect" table and ``reader_1`` loads the
    "actual" table on both guest and host, so ``psi_0`` receives the expect data
    as ``train_data`` and the actual data as ``validate_data``.

    Args:
        config: Either a path to a job config file (loaded with
            ``load_job_config``) or an already loaded config object that
            exposes ``parties.guest`` and ``parties.host``.
        namespace: Suffix appended to ``"experiment"`` to form the namespace
            of the input tables.
    """
    # obtain config
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    host = parties.host[0]

    expect_table = {"name": "expect", "namespace": f"experiment{namespace}"}
    actual_table = {"name": "actual", "namespace": f"experiment{namespace}"}

    pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host)

    # reader_0 supplies the "expect" table on every party. Previously each
    # reader loaded the same table per party, so PSI compared a table with
    # itself instead of expect vs. actual.
    reader_0 = Reader(name="reader_0")
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=expect_table)
    reader_0.get_party_instance(role='host', party_id=host).component_param(table=expect_table)

    # reader_1 supplies the "actual" table on every party.
    reader_1 = Reader(name="reader_1")
    reader_1.get_party_instance(role='guest', party_id=guest).component_param(table=actual_table)
    reader_1.get_party_instance(role='host', party_id=host).component_param(table=actual_table)

    data_transform_0 = DataTransform(name="data_transform_0")
    data_transform_1 = DataTransform(name="data_transform_1")

    # Both transforms use the same settings on both parties: no label column,
    # dense output format.
    for role, party_id in (('guest', guest), ('host', host)):
        for data_transform in (data_transform_0, data_transform_1):
            data_transform.get_party_instance(
                role=role, party_id=party_id).component_param(
                with_label=False, output_format="dense")

    psi_0 = PSI(name='psi_0', max_bin_num=20)

    pipeline.add_component(reader_0)
    pipeline.add_component(reader_1)
    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
    # data_transform_1 takes the model output of data_transform_0 as its model input.
    pipeline.add_component(
        data_transform_1,
        data=Data(data=reader_1.output.data),
        model=Model(data_transform_0.output.model))
    # PSI takes the expect data as train_data and the actual data as validate_data.
    pipeline.add_component(
        psi_0,
        data=Data(
            train_data=data_transform_0.output.data,
            validate_data=data_transform_1.output.data))

    pipeline.compile()

    pipeline.fit()


if __name__ == "__main__":
    # Command-line entry point: an optional -config path replaces main()'s
    # default config location.
    arg_parser = argparse.ArgumentParser("PIPELINE DEMO")
    arg_parser.add_argument("-config", type=str, help="config file")
    cli_args = arg_parser.parse_args()
    if cli_args.config is None:
        main()
    else:
        main(cli_args.config)
psi_testsuite.json
{
    "data": [
        {
            "file": "examples/data/breast_homo_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "expect",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_homo_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "actual",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_homo_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "expect",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/breast_homo_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "actual",
            "namespace": "experiment",
            "role": "guest_0"
        }
    ],
    "pipeline_tasks": {
        "psi": {
            "script": "./pipeline-psi.py"
        }
    }
}
## PSI Configuration Usage Guide.

This section introduces the dsl and conf relationships for usage.

#### Example Task.

1. PSI:  

    example-data: 

    (1) guest: expect: breast_homo_guest.csv; actual: breast_homo_host.csv
    (2) host: expect: breast_homo_guest.csv; actual: breast_homo_host.csv

    dsl: psi_cpn_dsl.json  

    runtime_config: psi_cpn_conf.json  


Users can use the following command to run a task.

    flow job submit -c ${runtime_config} -d ${dsl}

Moreover, after successfully running the training task, you can use it to predict too.
psi_cpn_conf.json
{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "host": [
            10000
        ],
        "guest": [
            9999
        ]
    },
    "component_parameters": {
        "common": {
            "psi_0": {
                "max_bin_num": 20
            }
        },
        "role": {
            "guest": {
                "0": {
                    "data_transform_1": {
                        "with_label": false,
                        "output_format": "dense"
                    },
                    "data_transform_0": {
                        "with_label": false,
                        "output_format": "dense"
                    },
                    "reader_0": {
                        "table": {
                            "name": "expect",
                            "namespace": "experiment"
                        }
                    },
                    "reader_1": {
                        "table": {
                            "name": "actual",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "host": {
                "0": {
                    "data_transform_1": {
                        "with_label": false,
                        "output_format": "dense"
                    },
                    "data_transform_0": {
                        "with_label": false,
                        "output_format": "dense"
                    },
                    "reader_0": {
                        "table": {
                            "name": "expect",
                            "namespace": "experiment"
                        }
                    },
                    "reader_1": {
                        "table": {
                            "name": "actual",
                            "namespace": "experiment"
                        }
                    }
                }
            }
        }
    }
}            
psi_cpn_dsl.json
{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "reader_1": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "data_transform_1": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_1.data"
                    ]
                },
                "model": [
                    "data_transform_0.model"
                ]
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "psi_0": {
            "module": "PSI",
            "input": {
                "data": {
                    "train_data": [
                        "data_transform_0.data"
                    ],
                    "validate_data": [
                        "data_transform_1.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}            
psi_testsuite.json
{
    "data": [
        {
            "file": "examples/data/breast_homo_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "expect",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_homo_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "actual",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_homo_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "expect",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/breast_homo_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "actual",
            "namespace": "experiment",
            "role": "guest_0"
        }
    ],
    "tasks": {
        "psi": {
            "conf": "psi_cpn_conf.json",
            "dsl": "psi_cpn_dsl.json"
        }
    }
}            

最后更新: 2021-11-15