Population Stability Index (PSI)¶
Introduction¶
Population stability index (PSI) is a metric that measures how much a feature's distribution has shifted between two sample sets. PSI is commonly used to assess the stability of models or the quality of features. In FATE, the PSI module computes PSI values of features between two tables.
Given two data columns, expect and actual, PSI is computed by the following steps:

1. Conduct quantile feature binning on the expect column and the actual column.
2. Compute each bin's interval percentage: (bin sample count) / (total sample number).
3. Compute the PSI value: psi = sum( (actual_percentage - expect_percentage) * ln(actual_percentage / expect_percentage) )
For more details of PSI, please refer to external reference material on the population stability index.
Param¶
- max_bin_num: int, max bin number of quantile feature binning
- need_run: bool, need to run this module in DSL
- dense_missing_val: int, float, or string; the value to be treated as a missing value when the input format is dense. Default is np.nan; keeping the default setting is suggested.
Examples¶
Example
## PSI Example Usage Guide.
#### Example Tasks
This section introduces the Pipeline scripts for different types of tasks.
1. compute PSI values between expect/actual table:
script: pipeline-psi.py
Users can run a pipeline job directly:
python ${pipeline_script}
init.py
pipeline-psi.py
import argparse
from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import PSI
from pipeline.component import Reader
from pipeline.interface import Data
from pipeline.interface import Model
from pipeline.utils.tools import load_job_config
def main(config="../../config.yaml", namespace=""):
    """Build and run a PSI pipeline job.

    Loads the "expect" (baseline) and "actual" tables on both parties,
    applies the same dense data transformation to each, and computes
    per-feature PSI values between the two tables.

    :param config: path to a job config YAML file, or an already-loaded
        config object (a str is passed through ``load_job_config``).
    :param namespace: suffix appended to the data namespace, used to
        isolate test runs.
    """
    # obtain config
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    host = parties.host[0]

    # PSI compares the baseline ("expect") distribution against the
    # "actual" distribution, so each party needs both tables uploaded
    # (the testsuite uploads both tables for both roles).
    expect_data = {"name": "expect", "namespace": f"experiment{namespace}"}
    actual_data = {"name": "actual", "namespace": f"experiment{namespace}"}

    pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host)

    # reader_0 loads the baseline ("expect") table on both parties.
    reader_0 = Reader(name="reader_0")
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=expect_data)
    reader_0.get_party_instance(role='host', party_id=host).component_param(table=expect_data)

    # reader_1 loads the "actual" table on both parties.
    # BUGFIX: the original script gave reader_1 the same tables as reader_0,
    # so each party would compute PSI of a distribution against itself
    # (always 0). The companion psi_cpn_conf.json wires reader_0 to "expect"
    # and reader_1 to "actual" on both roles; do the same here.
    reader_1 = Reader(name="reader_1")
    reader_1.get_party_instance(role='guest', party_id=guest).component_param(table=actual_data)
    reader_1.get_party_instance(role='host', party_id=host).component_param(table=actual_data)

    # Both transforms use the same settings on every party: dense output,
    # no label column.
    data_transform_0 = DataTransform(name="data_transform_0")
    data_transform_1 = DataTransform(name="data_transform_1")
    for transform in (data_transform_0, data_transform_1):
        for role, party_id in (('guest', guest), ('host', host)):
            transform.get_party_instance(role=role, party_id=party_id).component_param(
                with_label=False, output_format="dense")

    # PSI with at most 20 quantile bins per feature.
    psi_0 = PSI(name='psi_0', max_bin_num=20)

    pipeline.add_component(reader_0)
    pipeline.add_component(reader_1)
    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
    # Reuse data_transform_0's fitted model so both tables go through an
    # identical transformation.
    pipeline.add_component(
        data_transform_1,
        data=Data(data=reader_1.output.data),
        model=Model(data_transform_0.output.model))
    # expect table -> train_data (baseline), actual table -> validate_data.
    pipeline.add_component(
        psi_0,
        data=Data(
            train_data=data_transform_0.output.data,
            validate_data=data_transform_1.output.data))

    pipeline.compile()
    pipeline.fit()
if __name__ == "__main__":
    # Parse the optional job-config path; fall back to main()'s default
    # config when none is supplied.
    arg_parser = argparse.ArgumentParser("PIPELINE DEMO")
    arg_parser.add_argument("-config", type=str, help="config file")
    parsed = arg_parser.parse_args()
    if parsed.config is None:
        main()
    else:
        main(parsed.config)
psi_testsuite.json
{
"data": [
{
"file": "examples/data/breast_homo_guest.csv",
"head": 1,
"partition": 16,
"table_name": "expect",
"namespace": "experiment",
"role": "host_0"
},
{
"file": "examples/data/breast_homo_host.csv",
"head": 1,
"partition": 16,
"table_name": "actual",
"namespace": "experiment",
"role": "host_0"
},
{
"file": "examples/data/breast_homo_guest.csv",
"head": 1,
"partition": 16,
"table_name": "expect",
"namespace": "experiment",
"role": "guest_0"
},
{
"file": "examples/data/breast_homo_host.csv",
"head": 1,
"partition": 16,
"table_name": "actual",
"namespace": "experiment",
"role": "guest_0"
}
],
"pipeline_tasks": {
"psi": {
"script": "./pipeline-psi.py"
}
}
}
## PSI Configuration Usage Guide.
This section introduces the DSL and conf files used to submit the task.
#### Example Task.
1. PSI:
example-data:
(1) guest: expect: breast_homo_guest.csv; actual: breast_homo_host.csv
(2) host: expect: breast_homo_guest.csv; actual: breast_homo_host.csv
dsl: psi_cpn_dsl.json
runtime_config: psi_cpn_conf.json
Users can use following commands to run a task.
flow job submit -c ${runtime_config} -d ${dsl}
Moreover, after successfully running the training task, you can use it to predict too.
psi_cpn_conf.json
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 9999
},
"role": {
"host": [
10000
],
"guest": [
9999
]
},
"component_parameters": {
"common": {
"psi_0": {
"max_bin_num": 20
}
},
"role": {
"guest": {
"0": {
"data_transform_1": {
"with_label": false,
"output_format": "dense"
},
"data_transform_0": {
"with_label": false,
"output_format": "dense"
},
"reader_0": {
"table": {
"name": "expect",
"namespace": "experiment"
}
},
"reader_1": {
"table": {
"name": "actual",
"namespace": "experiment"
}
}
}
},
"host": {
"0": {
"data_transform_1": {
"with_label": false,
"output_format": "dense"
},
"data_transform_0": {
"with_label": false,
"output_format": "dense"
},
"reader_0": {
"table": {
"name": "expect",
"namespace": "experiment"
}
},
"reader_1": {
"table": {
"name": "actual",
"namespace": "experiment"
}
}
}
}
}
}
}
psi_cpn_dsl.json
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": [
"data"
]
}
},
"reader_1": {
"module": "Reader",
"output": {
"data": [
"data"
]
}
},
"data_transform_0": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
},
"data_transform_1": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_1.data"
]
},
"model": [
"data_transform_0.model"
]
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
},
"psi_0": {
"module": "PSI",
"input": {
"data": {
"train_data": [
"data_transform_0.data"
],
"validate_data": [
"data_transform_1.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
}
}
}
psi_testsuite.json
{
"data": [
{
"file": "examples/data/breast_homo_guest.csv",
"head": 1,
"partition": 16,
"table_name": "expect",
"namespace": "experiment",
"role": "host_0"
},
{
"file": "examples/data/breast_homo_host.csv",
"head": 1,
"partition": 16,
"table_name": "actual",
"namespace": "experiment",
"role": "host_0"
},
{
"file": "examples/data/breast_homo_guest.csv",
"head": 1,
"partition": 16,
"table_name": "expect",
"namespace": "experiment",
"role": "guest_0"
},
{
"file": "examples/data/breast_homo_host.csv",
"head": 1,
"partition": 16,
"table_name": "actual",
"namespace": "experiment",
"role": "guest_0"
}
],
"tasks": {
"psi": {
"conf": "psi_cpn_conf.json",
"dsl": "psi_cpn_dsl.json"
}
}
}
Last update:
2021-11-15