Skip to content

Commit 42c1595

Browse files
Add two dimension binning and calculate binning result according to provided binning boundaries. (#88)
* Add two methods in Binning Runnable: two-dimension binning and calculating the binning result according to provided binning boundaries. * Add more logs. * Do some rephrasing. * Rename the binning folder to bin to avoid the name conflict between the binning folder and binning.py.
1 parent 68b957a commit 42c1595

5 files changed

Lines changed: 96 additions & 7 deletions

File tree

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def calc_binning_stats(
5757
bin_methods,
5858
bin_nums,
5959
cols_bin_boundaries,
60-
reverse_cumsum=False):
60+
reverse_cumsum):
6161
cols_bin_stats = []
6262
for i in range(len(sel_cols)):
6363
sel_col = sel_cols[i]
@@ -102,11 +102,46 @@ def calc_stats(
102102
bin_methods,
103103
bin_nums,
104104
cols_bin_boundaries,
105-
reverse_cumsum=False):
105+
reverse_cumsum):
106106
basic_stats_df = calc_basic_stats(in_md, sel_cols)
107107
cols_bin_stats_df = calc_binning_stats(in_md, sel_cols, bin_methods, bin_nums, cols_bin_boundaries, reverse_cumsum)
108108

109109
stats_df = pd.merge(basic_stats_df, cols_bin_stats_df, how='inner', on='name')
110110

111111
return stats_df
112112

113+
114+
def calc_two_dim_binning_stats(
        in_md,
        sel_col_1,
        sel_col_2,
        bin_method_1,
        bin_method_2,
        bin_num_1,
        bin_num_2,
        bin_boundaries_1,
        bin_boundaries_2,
        reverse_cumsum):
    """Compute the joint bin probability grid for two columns.

    Each column is binned independently through ``binning``; the two results
    are then combined into one flat cell index whose relative frequencies
    are laid out as a two-dimensional grid.

    Args:
        in_md: Input dataframe handed to ``binning`` (a Mars dataframe,
            judging by the ``to_pandas()`` call below -- TODO confirm).
        sel_col_1: Name of the column binned along the grid rows.
        sel_col_2: Name of the column binned along the grid columns.
        bin_method_1: Binning method forwarded to ``binning`` for column 1.
        bin_method_2: Binning method forwarded to ``binning`` for column 2.
        bin_num_1: Requested bin count forwarded to ``binning`` for column 1.
        bin_num_2: Requested bin count forwarded to ``binning`` for column 2.
        bin_boundaries_1: Explicit boundaries forwarded for column 1.
        bin_boundaries_2: Explicit boundaries forwarded for column 2.
        reverse_cumsum: Flag forwarded unchanged to ``cumsum``.

    Returns:
        A pair of pandas DataFrames: the grid of cell probabilities and the
        result of applying ``cumsum`` to that grid.
    """
    # Presumably the first value holds a per-row bin index and the second
    # the list of bin edges -- verify against ``binning``.
    first_codes, first_edges = binning(in_md, sel_col_1, bin_method_1, bin_num_1, bin_boundaries_1)
    second_codes, second_edges = binning(in_md, sel_col_2, bin_method_2, bin_num_2, bin_boundaries_2)

    # The grid shape comes from the edges actually returned, not from the
    # requested bin counts.
    n_rows = len(first_edges) - 1
    n_cols = len(second_edges) - 1

    # Fold the two bin indices into a single row-major cell index.
    flat_codes = first_codes * n_cols + second_codes

    # Relative frequency per cell; cells that never occur are filled with 0
    # so that the reshape below always has n_rows * n_cols entries.
    flat_probs = (
        flat_codes.value_counts(normalize=True)
        .to_pandas()
        .to_frame()
        .reindex(range(n_rows * n_cols), fill_value=0)
    )

    prob_grid = flat_probs.to_numpy().reshape((n_rows, n_cols))
    cumsum_grid = cumsum(prob_grid, reverse_cumsum)

    return pd.DataFrame(prob_grid), pd.DataFrame(cumsum_grid)
138+
139+
140+
def get_cols_bin_boundaries(stats_df):
    """Map each column name to its list of bin boundaries.

    Args:
        stats_df: DataFrame with a ``name`` column and a ``bin_boundaries``
            column holding comma-separated numbers as a string.

    Returns:
        A dict from each ``name`` value to the list of floats parsed from
        that row's ``bin_boundaries`` string. When a name appears in more
        than one row, the last row wins.

    Raises:
        ValueError: If any comma-separated piece is not parseable by
            ``float``.
    """
    return {
        record['name']: list(map(float, record['bin_boundaries'].split(',')))
        for _, record in stats_df.iterrows()
    }

runnables/binning.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import argparse
22
import mars.dataframe as md
33
import os
4-
from binning.binning import calc_stats
4+
import pandas as pd
5+
from bin.binning_calculator import calc_stats, calc_two_dim_binning_stats, get_cols_bin_boundaries
56
from run_io.db_adapter import convertDSNToRfc1738
67
from sqlalchemy import create_engine
78

@@ -12,7 +13,9 @@ def build_argument_parser():
1213
parser.add_argument("--columns", type=str, required=True)
1314
parser.add_argument("--bin_methods", type=str, required=False)
1415
parser.add_argument("--bin_nums", type=str, required=False)
16+
parser.add_argument("--bin_input_table", type=str, required=False)
1517
parser.add_argument("--reverse_cumsum", type=bool, default=False)
18+
parser.add_argument("--two_dim_bin_cols", type=str, required=False)
1619

1720
return parser
1821

@@ -23,29 +26,80 @@ def build_argument_parser():
2326
columns = args.columns.split(',')
2427
bin_methods = args.bin_methods.split(',') if args.bin_methods else None
2528
bin_nums = [int(item) for item in args.bin_nums.split(',')] if args.bin_nums else None
29+
two_dim_bin_cols = args.two_dim_bin_cols.split(',') if args.two_dim_bin_cols else None
2630

2731
select_input = os.getenv("SQLFLOW_TO_RUN_SELECT")
2832
output = os.getenv("SQLFLOW_TO_RUN_INTO")
33+
output_tables = output.split(',')
2934
datasource = os.getenv("SQLFLOW_DATASOURCE")
3035

36+
# Check arguments
37+
if two_dim_bin_cols:
38+
assert(len(two_dim_bin_cols) == 2)
39+
assert(len(output_tables) == 3)
40+
3141
url = convertDSNToRfc1738(datasource, args.dbname)
3242
engine = create_engine(url)
3343
input_md = md.read_sql(
3444
sql=select_input,
3545
con=engine)
3646
input_md.execute()
3747

48+
cols_bin_boundaries = {}
49+
if args.bin_input_table:
50+
print("Get provided bin boundaries from table {}".format(args.bin_input_table))
51+
bin_input_df = pd.read_sql_table(
52+
table_name=args.bin_input_table,
53+
con=engine)
54+
cols_bin_boundaries = get_cols_bin_boundaries(bin_input_df)
55+
56+
if set(columns) > cols_bin_boundaries.keys():
57+
raise ValueError("The provided bin boundaries contains keys: {}. But they cannot cover all the \
58+
input columns: {}".format(cols_bin_boundaries.keys(), columns))
59+
60+
print("Ignore the bin_nums and bin_methods arguments")
61+
bin_nums = [None for i in range(len(columns))]
62+
bin_methods = [None for i in range(len(columns))]
63+
64+
print("Calculate the statistics result for columns: {}".format(columns))
3865
stats_df = calc_stats(
3966
input_md,
4067
columns,
4168
bin_methods,
4269
bin_nums,
43-
{},
70+
cols_bin_boundaries,
4471
args.reverse_cumsum)
4572

46-
print("Persist the statistics result into the table {}".format(output))
73+
print("Persist the statistics result into the table {}".format(output_tables[0]))
4774
stats_df.to_sql(
48-
name=output,
75+
name=output_tables[0],
4976
con=engine,
5077
index=False
5178
)
79+
80+
if args.two_dim_bin_cols:
81+
print("Calculate two dimension binning result for columns: {}".format(columns))
82+
bin_prob_df, bin_cumsum_prob_df = calc_two_dim_binning_stats(
83+
input_md,
84+
columns[0],
85+
columns[1],
86+
bin_methods[0],
87+
bin_methods[1],
88+
bin_nums[0],
89+
bin_nums[1],
90+
cols_bin_boundaries.get(columns[0], None),
91+
cols_bin_boundaries.get(columns[1], None),
92+
args.reverse_cumsum)
93+
94+
print("Persist the binning probabilities into table {}".format(output_tables[1]))
95+
bin_prob_df.to_sql(
96+
name=output_tables[1],
97+
con=engine,
98+
index=False
99+
)
100+
print("Persist the binning accumulated probabilities into table {}".format(output_tables[2]))
101+
bin_cumsum_prob_df.to_sql(
102+
name=output_tables[2],
103+
con=engine,
104+
index=False
105+
)

runnables/psi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import argparse
22
import os
33
import pandas as pd
4-
from binning.psi import calc_psi, get_cols_bin_probs
4+
from bin.psi_calculator import calc_psi, get_cols_bin_probs
55
from run_io.db_adapter import convertDSNToRfc1738
66
from sqlalchemy import create_engine
77

0 commit comments

Comments
 (0)