# Heterogeneous Pearson Correlation Coefficient¶

## Introduction¶

Pearson Correlation Coefficient is a measure of the linear correlation between two variables, $X$ and $Y$, defined as,

$$\rho_{X,Y} = \frac{\operatorname{cov}(X, Y)}{\sigma_X\sigma_Y} = \frac{E[(X-\mu_X)(Y-\mu_Y)]}{\sigma_X\sigma_Y} = E\left[\frac{X-\mu_X}{\sigma_X}\cdot\frac{Y-\mu_Y}{\sigma_Y}\right]$$

Let

$$\tilde{X} = \frac{X-\mu_X}{\sigma_X},\quad \tilde{Y}=\frac{Y-\mu_Y}{\sigma_Y}$$

then,

$$\rho_{X, Y} = E[\tilde{X}\tilde{Y}]$$

## Implementation Detail¶

We use an MPC protocol called SPDZ for Heterogeneous Pearson Correlation Coefficient calculation. For more details, one can refer to [here].

## Param¶

### pearson_param¶

#### Classes¶

##### PearsonParam(column_names=None, column_indexes=None, cross_parties=True, need_run=True, use_mix_rand=False, calc_local_vif=True)¶

Bases: BaseParam

param for pearson correlation

Parameters:

Name Type Description Default
column_names list of string

list of column names

None
column_indexes list of int

list of column indexes

None
cross_parties bool, default

if True, calculate correlation of columns from both parties

True
need_run bool

set False to skip this party

True
use_mix_rand bool, default

mix system random and pseudo random for quicker calculation

False
calc_local_vif bool, default True

calculate VIF for columns in local

True
Source code in python/federatedml/param/pearson_param.py
 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 def __init__( self, column_names=None, column_indexes=None, cross_parties=True, need_run=True, use_mix_rand=False, calc_local_vif=True, ): super().__init__() self.column_names = column_names self.column_indexes = column_indexes self.cross_parties = cross_parties self.need_run = need_run self.use_mix_rand = use_mix_rand self.calc_local_vif = calc_local_vif 
###### Attributes¶
column_names = column_names instance-attribute
column_indexes = column_indexes instance-attribute
cross_parties = cross_parties instance-attribute
need_run = need_run instance-attribute
use_mix_rand = use_mix_rand instance-attribute
calc_local_vif = calc_local_vif instance-attribute
###### Functions¶
check()
Source code in python/federatedml/param/pearson_param.py
 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 def check(self): if not isinstance(self.use_mix_rand, bool): raise ValueError( f"use_mix_rand accept bool type only, {type(self.use_mix_rand)} got" ) if self.cross_parties and (not self.need_run): raise ValueError( f"need_run should be True(which is default) when cross_parties is True." ) self.column_indexes = [] if self.column_indexes is None else self.column_indexes self.column_names = [] if self.column_names is None else self.column_names if not isinstance(self.column_names, list): raise ValueError( f"type mismatch, column_names with type {type(self.column_names)}" ) for name in self.column_names: if not isinstance(name, str): raise ValueError( f"type mismatch, column_names with element {name}(type is {type(name)})" ) if isinstance(self.column_indexes, list): for idx in self.column_indexes: if not isinstance(idx, int): raise ValueError( f"type mismatch, column_indexes with element {idx}(type is {type(idx)})" ) if isinstance(self.column_indexes, int) and self.column_indexes != -1: raise ValueError( f"column_indexes with type int and value {self.column_indexes}(only -1 allowed)" ) if self.need_run: if isinstance(self.column_indexes, list) and isinstance( self.column_names, list ): if len(self.column_indexes) == 0 and len(self.column_names) == 0: raise ValueError(f"provide at least one column") 

## How to Use¶

• params

• column_indexes
-1 or list of int. If -1 provided, all columns are used for calculation. If a list of int provided, columns with given indexes are used for calculation.

• column_names
names of columns used for calculation.

Note

if both params are provided, the union of columns indicated is used for calculation.

## Examples¶

Example
## Hetero Pearson Pipeline Example Usage Guide.

This section introduces the Pipeline scripts for different types of tasks.

1. Base Cross Parties Task:

script: pipeline_hetero_pearson.py

2. Host Only Task:

script: pipeline_hetero_pearson_host_only.py

3. Sole Task:

script: pipeline_hetero_pearson_sole.py

4. Use Mix Rand Task:

script: pipeline_hetero_pearson_mix_rand.py

Users can run a pipeline job directly:

python ${pipeline_script}  pipeline_hetero_pearson_mix_rand.py import argparse from pipeline.backend.pipeline import PipeLine from pipeline.component import DataTransform, HeteroPearson, Intersection, Reader from pipeline.interface import Data from pipeline.utils.tools import load_job_config def main(config="../../config.yaml", namespace=""): common_param = dict(column_indexes=-1, use_mix_rand=True) pipeline = run_pearson_pipeline( config=config, namespace=namespace, data=dataset.breast, common_param=common_param, ) print(pipeline.get_component("hetero_pearson_0").get_model_param()) print(pipeline.get_component("hetero_pearson_0").get_summary()) def run_pearson_pipeline( config, namespace, data, common_param=None, guest_only_param=None, host_only_param=None, ): if isinstance(config, str): config = load_job_config(config) guest_data = data["guest"] host_data = data["host"][0] guest_data["namespace"] = f"{guest_data['namespace']}{namespace}" host_data["namespace"] = f"{host_data['namespace']}{namespace}" pipeline = ( PipeLine() .set_initiator(role="guest", party_id=config.parties.guest[0]) .set_roles(guest=config.parties.guest[0], host=config.parties.host[0]) ) reader_0 = Reader(name="reader_0") reader_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(table=guest_data) reader_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(table=host_data) data_transform_0 = DataTransform(name="data_transform_0") data_transform_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(with_label=True, output_format="dense") data_transform_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(with_label=False) intersect_0 = Intersection(name="intersection_0") if common_param is None: common_param = {} hetero_pearson_component = HeteroPearson(name="hetero_pearson_0", **common_param) if guest_only_param: hetero_pearson_component.get_party_instance( 
"guest", config.parties.guest[0] ).component_param(**guest_only_param) if host_only_param: hetero_pearson_component.get_party_instance( "host", config.parties.host[0] ).component_param(**host_only_param) pipeline.add_component(reader_0) pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data)) pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data)) pipeline.add_component( hetero_pearson_component, data=Data(train_data=intersect_0.output.data) ) pipeline.compile() pipeline.fit() return pipeline class dataset_meta(type): @property def breast(cls): return { "guest": {"name": "breast_hetero_guest", "namespace": "experiment"}, "host": [{"name": "breast_hetero_host", "namespace": "experiment"}], } class dataset(metaclass=dataset_meta): ... if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") parser.add_argument("-config", type=str, help="config file") args = parser.parse_args() if args.config is not None: main(args.config) else: main()  hetero_pearson_testsuite.json { "data": [ { "file": "examples/data/breast_hetero_guest.csv", "head": 1, "partition": 16, "table_name": "breast_hetero_guest", "namespace": "experiment", "role": "guest_0" }, { "file": "examples/data/breast_hetero_host.csv", "head": 1, "partition": 16, "table_name": "breast_hetero_host", "namespace": "experiment", "role": "host_0" } ], "pipeline_tasks": { "default": { "script": "./pipeline_hetero_pearson.py" }, "host_only": { "script": "./pipeline_hetero_pearson_host_only.py" }, "sole": { "script": "./pipeline_hetero_pearson_sole.py" }, "mix_rand": { "script": "./pipeline_hetero_pearson_mix_rand.py" } } }  init.py import os import sys additional_path = os.path.realpath("../") if additional_path not in sys.path: sys.path.append(additional_path)  pipeline_hetero_pearson_sole.py import argparse from pipeline.backend.pipeline import PipeLine from pipeline.component import DataTransform, HeteroPearson, Intersection, Reader from 
pipeline.interface import Data from pipeline.utils.tools import load_job_config def main(config="../../config.yaml", namespace=""): common_param = dict(column_indexes=-1, cross_parties=False) pipeline = run_pearson_pipeline( config=config, namespace=namespace, data=dataset.breast, common_param=common_param, ) print(pipeline.get_component("hetero_pearson_0").get_model_param()) print(pipeline.get_component("hetero_pearson_0").get_summary()) def run_pearson_pipeline( config, namespace, data, common_param=None, guest_only_param=None, host_only_param=None, ): if isinstance(config, str): config = load_job_config(config) guest_data = data["guest"] host_data = data["host"][0] guest_data["namespace"] = f"{guest_data['namespace']}{namespace}" host_data["namespace"] = f"{host_data['namespace']}{namespace}" pipeline = ( PipeLine() .set_initiator(role="guest", party_id=config.parties.guest[0]) .set_roles(guest=config.parties.guest[0], host=config.parties.host[0]) ) reader_0 = Reader(name="reader_0") reader_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(table=guest_data) reader_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(table=host_data) data_transform_0 = DataTransform(name="data_transform_0") data_transform_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(with_label=True, output_format="dense") data_transform_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(with_label=False) intersect_0 = Intersection(name="intersection_0") if common_param is None: common_param = {} hetero_pearson_component = HeteroPearson(name="hetero_pearson_0", **common_param) if guest_only_param: hetero_pearson_component.get_party_instance( "guest", config.parties.guest[0] ).component_param(**guest_only_param) if host_only_param: hetero_pearson_component.get_party_instance( "host", config.parties.host[0] ).component_param(**host_only_param) 
pipeline.add_component(reader_0) pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data)) pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data)) pipeline.add_component( hetero_pearson_component, data=Data(train_data=intersect_0.output.data) ) pipeline.compile() pipeline.fit() return pipeline class dataset_meta(type): @property def breast(cls): return { "guest": {"name": "breast_hetero_guest", "namespace": "experiment"}, "host": [{"name": "breast_hetero_host", "namespace": "experiment"}], } class dataset(metaclass=dataset_meta): ... if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") parser.add_argument("-config", type=str, help="config file") args = parser.parse_args() if args.config is not None: main(args.config) else: main()  pipeline_hetero_pearson.py import argparse from pipeline.backend.pipeline import PipeLine from pipeline.component import DataTransform, HeteroPearson, Intersection, Reader from pipeline.interface import Data from pipeline.utils.tools import load_job_config def main(config="../../config.yaml", namespace=""): common_param = dict(column_indexes=-1) pipeline = run_pearson_pipeline( config=config, namespace=namespace, data=dataset.breast, common_param=common_param, ) print(pipeline.get_component("hetero_pearson_0").get_model_param()) print(pipeline.get_component("hetero_pearson_0").get_summary()) def run_pearson_pipeline( config, namespace, data, common_param=None, guest_only_param=None, host_only_param=None, ): if isinstance(config, str): config = load_job_config(config) guest_data = data["guest"] host_data = data["host"][0] guest_data["namespace"] = f"{guest_data['namespace']}{namespace}" host_data["namespace"] = f"{host_data['namespace']}{namespace}" pipeline = ( PipeLine() .set_initiator(role="guest", party_id=config.parties.guest[0]) .set_roles(guest=config.parties.guest[0], host=config.parties.host[0]) ) reader_0 = Reader(name="reader_0") reader_0.get_party_instance( 
role="guest", party_id=config.parties.guest[0] ).component_param(table=guest_data) reader_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(table=host_data) data_transform_0 = DataTransform(name="data_transform_0") data_transform_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(with_label=True, output_format="dense") data_transform_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(with_label=False) intersect_0 = Intersection(name="intersection_0") if common_param is None: common_param = {} hetero_pearson_component = HeteroPearson(name="hetero_pearson_0", **common_param) if guest_only_param: hetero_pearson_component.get_party_instance( "guest", config.parties.guest[0] ).component_param(**guest_only_param) if host_only_param: hetero_pearson_component.get_party_instance( "host", config.parties.host[0] ).component_param(**host_only_param) pipeline.add_component(reader_0) pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data)) pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data)) pipeline.add_component( hetero_pearson_component, data=Data(train_data=intersect_0.output.data) ) pipeline.compile() pipeline.fit() return pipeline class dataset_meta(type): @property def breast(cls): return { "guest": {"name": "breast_hetero_guest", "namespace": "experiment"}, "host": [{"name": "breast_hetero_host", "namespace": "experiment"}], } class dataset(metaclass=dataset_meta): ... 
if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") parser.add_argument("-config", type=str, help="config file") args = parser.parse_args() if args.config is not None: main(args.config) else: main()  pipeline_hetero_pearson_host_only.py import argparse from pipeline.backend.pipeline import PipeLine from pipeline.component import DataTransform, HeteroPearson, Intersection, Reader from pipeline.interface import Data from pipeline.utils.tools import load_job_config def main(config="../../config.yaml", namespace=""): common_param = dict(column_indexes=-1, cross_parties=False) guest_only_param = dict(need_run=False) pipeline = run_pearson_pipeline( config=config, namespace=namespace, data=dataset.breast, common_param=common_param, guest_only_param=guest_only_param, ) def run_pearson_pipeline( config, namespace, data, common_param=None, guest_only_param=None, host_only_param=None, ): if isinstance(config, str): config = load_job_config(config) guest_data = data["guest"] host_data = data["host"][0] guest_data["namespace"] = f"{guest_data['namespace']}{namespace}" host_data["namespace"] = f"{host_data['namespace']}{namespace}" pipeline = ( PipeLine() .set_initiator(role="guest", party_id=config.parties.guest[0]) .set_roles(guest=config.parties.guest[0], host=config.parties.host[0]) ) reader_0 = Reader(name="reader_0") reader_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(table=guest_data) reader_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(table=host_data) data_transform_0 = DataTransform(name="data_transform_0") data_transform_0.get_party_instance( role="guest", party_id=config.parties.guest[0] ).component_param(with_label=True, output_format="dense") data_transform_0.get_party_instance( role="host", party_id=config.parties.host[0] ).component_param(with_label=False) intersect_0 = Intersection(name="intersection_0") if common_param is None: common_param = {} 
hetero_pearson_component = HeteroPearson(name="hetero_pearson_0", **common_param) if guest_only_param: hetero_pearson_component.get_party_instance( "guest", config.parties.guest[0] ).component_param(**guest_only_param) if host_only_param: hetero_pearson_component.get_party_instance( "host", config.parties.host[0] ).component_param(**host_only_param) pipeline.add_component(reader_0) pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data)) pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data)) pipeline.add_component( hetero_pearson_component, data=Data(train_data=intersect_0.output.data) ) pipeline.compile() pipeline.fit() return pipeline class dataset_meta(type): @property def breast(cls): return { "guest": {"name": "breast_hetero_guest", "namespace": "experiment"}, "host": [{"name": "breast_hetero_host", "namespace": "experiment"}], } class dataset(metaclass=dataset_meta): ... if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") parser.add_argument("-config", type=str, help="config file") args = parser.parse_args() if args.config is not None: main(args.config) else: main()  ## Hetero Pearson Configuration Usage Guide. This section introduces the dsl and conf for usage of different type of task. #### Training Task. 1. Base Cross Parties Task: dsl: test_hetero_pearson_default_dsl.json runtime_config : test_hetero_pearson_default_conf.json 2. Host Only Task: dsl: test_hetero_pearson_host_only_dsl.json runtime_config : test_hetero_pearson_host_only_conf.json 3. Sole Task: dsl: test_hetero_pearson_sole_dsl.json runtime_config : test_hetero_pearson_sole_conf.json 4. Use Mix Rand Task: dsl: test_hetero_pearson_mix_rand_dsl.json runtime_config : test_hetero_pearson_mix_rand_conf.json Users can use following commands to run a task. flow job submit -c${runtime_config} -d \${dsl}

test_hetero_pearson_host_only_dsl.json
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": [
"data"
]
}
},
"data_transform_0": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": [
"data_transform_0.data"
]
}
},
"output": {
"data": [
"data"
]
}
},
"hetero_pearson_0": {
"module": "HeteroPearson",
"input": {
"data": {
"train_data": [
"intersection_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
}
}
}

test_hetero_pearson_sole_dsl.json
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": [
"data"
]
}
},
"data_transform_0": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": [
"data_transform_0.data"
]
}
},
"output": {
"data": [
"data"
]
}
},
"hetero_pearson_0": {
"module": "HeteroPearson",
"input": {
"data": {
"train_data": [
"intersection_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
}
}
}

test_hetero_pearson_mix_rand_dsl.json
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": [
"data"
]
}
},
"data_transform_0": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": [
"data_transform_0.data"
]
}
},
"output": {
"data": [
"data"
]
}
},
"hetero_pearson_0": {
"module": "HeteroPearson",
"input": {
"data": {
"train_data": [
"intersection_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
}
}
}

test_hetero_pearson_mix_rand_conf.json
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 9999
},
"role": {
"host": [
10000
],
"guest": [
9999
]
},
"component_parameters": {
"common": {
"hetero_pearson_0": {
"column_indexes": -1,
"use_mix_rand": true
}
},
"role": {
"host": {
"0": {
"data_transform_0": {
"with_label": false
},
"table": {
"name": "breast_hetero_host",
"namespace": "experiment"
}
}
},
"guest": {
"0": {
"data_transform_0": {
"with_label": true,
"output_format": "dense"
},
"table": {
"name": "breast_hetero_guest",
"namespace": "experiment"
}
}
}
}
}
}

hetero_pearson_testsuite.json
{
"data": [
{
"file": "examples/data/breast_hetero_guest.csv",
"partition": 16,
"table_name": "breast_hetero_guest",
"namespace": "experiment",
"role": "guest_0"
},
{
"file": "examples/data/breast_hetero_host.csv",
"partition": 16,
"table_name": "breast_hetero_host",
"namespace": "experiment",
"role": "host_0"
}
],
"tasks": {
"default": {
"conf": "./test_hetero_pearson_default_conf.json",
"dsl": "./test_hetero_pearson_default_dsl.json"
},
"host_only": {
"conf": "./test_hetero_pearson_host_only_conf.json",
"dsl": "./test_hetero_pearson_host_only_dsl.json"
},
"sole": {
"conf": "./test_hetero_pearson_sole_conf.json",
"dsl": "./test_hetero_pearson_sole_dsl.json"
},
"mix_rand": {
"conf": "./test_hetero_pearson_mix_rand_conf.json",
"dsl": "./test_hetero_pearson_mix_rand_dsl.json"
}
}
}

test_hetero_pearson_default_dsl.json
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": [
"data"
]
}
},
"data_transform_0": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": [
"data_transform_0.data"
]
}
},
"output": {
"data": [
"data"
]
}
},
"hetero_pearson_0": {
"module": "HeteroPearson",
"input": {
"data": {
"train_data": [
"intersection_0.data"
]
}
},
"output": {
"data": [
"data"
],
"model": [
"model"
]
}
}
}
}

test_hetero_pearson_host_only_conf.json
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 9999
},
"role": {
"host": [
10000
],
"guest": [
9999
]
},
"component_parameters": {
"common": {
"hetero_pearson_0": {
"column_indexes": -1,
"cross_parties": false
}
},
"role": {
"host": {
"0": {
"table": {
"name": "breast_hetero_host",
"namespace": "experiment"
},
"data_transform_0": {
"with_label": false
}
}
},
"guest": {
"0": {
"hetero_pearson_0": {
"need_run": false
},
"table": {
"name": "breast_hetero_guest",
"namespace": "experiment"
},
"data_transform_0": {
"with_label": true,
"output_format": "dense"
}
}
}
}
}
}

test_hetero_pearson_sole_conf.json
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 9999
},
"role": {
"host": [
10000
],
"guest": [
9999
]
},
"component_parameters": {
"common": {
"hetero_pearson_0": {
"column_indexes": -1,
"cross_parties": false
}
},
"role": {
"host": {
"0": {
"table": {
"name": "breast_hetero_host",
"namespace": "experiment"
},
"data_transform_0": {
"with_label": false
}
}
},
"guest": {
"0": {
"table": {
"name": "breast_hetero_guest",
"namespace": "experiment"
},
"data_transform_0": {
"with_label": true,
"output_format": "dense"
}
}
}
}
}
}

test_hetero_pearson_default_conf.json
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 9999
},
"role": {
"host": [
10000
],
"guest": [
9999
]
},
"component_parameters": {
"common": {
"hetero_pearson_0": {
"column_indexes": -1
}
},
"role": {
"guest": {
"0": {
"data_transform_0": {
"with_label": true,
"output_format": "dense"
},
"table": {
"name": "breast_hetero_guest",
"namespace": "experiment"
}
}
}
},
"host": {
"0": {
"data_transform_0": {
"with_label": false
},