Skip to content

Commit 2aec0a1

Browse files
authored
scorecard model (#59)
* scorecard demo * update * calculate AUC * update model name * update * update
1 parent 56c35a9 commit 2aec0a1

3 files changed

Lines changed: 139 additions & 0 deletions

File tree

sqlflow_models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .dnnclassifier_functional_api_example import dnnclassifier_functional_model
88
from .rnn_based_time_series import RNNBasedTimeSeriesModel
99
from .auto_estimator import AutoClassifier, AutoRegressor
10+
from .score_card import ScoreCard
1011
from .native_keras import RawDNNClassifier
1112
from .custom_model_example import CustomClassifier
1213
from .gcn import GCN

sqlflow_models/score_card.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#!/bin/env python
2+
3+
import tensorflow as tf
4+
from tensorflow import keras
5+
from tensorflow.python.data import make_one_shot_iterator
6+
from tensorflow.keras.losses import kld
7+
from tensorflow.keras.optimizers import SGD
8+
import numpy as np
9+
import pandas as pd
10+
import scipy.stats.stats as stats
11+
import sklearn
12+
from sklearn.linear_model import LogisticRegression
13+
from sklearn.model_selection import train_test_split
14+
from sklearn.metrics import roc_auc_score, auc
15+
import pickle
16+
17+
18+
def optimizer():
    """Return the optimizer for this model module, which is always ``None``.

    ``ScoreCard.sqlflow_train_loop`` fits a scikit-learn
    ``LogisticRegression`` directly, so no Keras optimizer is involved.
    NOTE(review): presumably this module-level hook is looked up by the
    SQLFlow runtime -- confirm against the caller.
    """
    return None
20+
21+
22+
def loss():
    """Return the loss for this model module, which is always ``None``.

    ``ScoreCard.sqlflow_train_loop`` fits a scikit-learn
    ``LogisticRegression`` directly, so no Keras loss is involved.
    NOTE(review): presumably this module-level hook is looked up by the
    SQLFlow runtime -- confirm against the caller.
    """
    return None
24+
25+
26+
class ScoreCard(keras.Model):
    """Score card model: WOE-encoded features fed to a logistic regression.

    Every feature column is split into quantile buckets, each bucket is
    replaced by its weight of evidence (WOE), and a scikit-learn
    ``LogisticRegression`` is fitted on the encoded frame. Training is done
    entirely in ``sqlflow_train_loop``; no Keras layers are defined.
    """

    def __init__(self, feature_columns=None, pf_bin_size=5):
        """Set up the scaling constants and the bucket registry.

        Args:
            feature_columns: Accepted for interface compatibility; not used
                by this class.
            pf_bin_size: Number of quantile buckets requested per feature
                (passed to ``pd.qcut``).
        """
        super(ScoreCard, self).__init__(name='ScoreCard')

        # Score scaling: factor = 20 / ln(2), offset = 600 - factor * ln(20).
        # NOTE(review): looks like the usual "20 points to double the odds,
        # 600 points at odds 20" scaling -- confirm. `_offset` is not used
        # by any method of this class.
        self._factor = 20/np.log(2)
        self._offset = 600 - 20*np.log(20) / np.log(2)
        # column name -> (stats frame, cut points, WOE list, IV series)
        self._bins = dict()
        self._pf_bin_size = pf_bin_size

    def _pf_bin(self, y, x):
        """Bucket one feature by population frequency and compute WOE / IV.

        Args:
            y: Label series; the sum of ``y`` is counted as the number of
                "bad" samples, so labels are expected to be 0/1.
            x: Feature series aligned with ``y``.

        Returns:
            A tuple ``(d3, cut, woe, iv)``: the per-bucket statistics frame,
            the list of bucket edges (first edge ``-inf``, last edge
            ``inf``), the per-bucket WOE values rounded to 6 decimals, and
            the per-bucket IV series.
        """
        # population frequency bucket
        bad_num = y.sum()
        good_num = y.count() - bad_num
        d1 = pd.DataFrame({
            'x': x,
            'y': y,
            'bucket': pd.qcut(x, self._pf_bin_size, duplicates='drop'),
        })
        d2 = d1.groupby('bucket', as_index=True)

        # Aggregate once instead of re-running the same groupby reductions.
        bad = d2.y.sum()
        total = d2.y.count()

        # NOTE(review): 'min_bin' does not match the series name 'x', so
        # this column appears to come out all-NaN; kept for compatibility.
        d3 = pd.DataFrame(d2.x.min(), columns=['min_bin'])
        d3["min"] = d2.x.min()
        d3["max"] = d2.x.max()
        d3["badcostum"] = bad
        d3["goodcostum"] = total - bad
        d3["total"] = total
        d3["bad_rate"] = bad / total
        # WOE = ln((bad_i / good_i) * (good_total / bad_total)).
        # NOTE(review): a bucket with no good or no bad samples yields an
        # infinite WOE, which the downstream regression cannot handle.
        d3["woe"] = np.log(d3["badcostum"]/d3["goodcostum"]*good_num/bad_num)
        iv = (d3["badcostum"]/bad_num - d3["goodcostum"]/good_num) * d3["woe"]
        d3["iv"] = iv
        woe = list(d3["woe"].round(6))
        cut = list(d3["max"].round(6))
        # Open both ends so every value falls into some bucket.
        cut.insert(0, float("-inf"))
        cut[-1] = float("inf")
        return d3, cut, woe, iv

    def _to_dataframe(self, dataset):
        """Materialise an iterable of ``(features, label)`` pairs as frames.

        Args:
            dataset: Iterable yielding ``(data, label)`` where ``data`` maps
                feature names to tensors and ``label`` is a tensor. Only
                element ``[0][0]`` of each feature and ``[0]`` of each label
                is read, i.e. one sample per element is assumed.

        Returns:
            ``(x_df, y_df)``: the feature frame and a frame with a single
            ``'label'`` column.
        """
        # Collect rows first and build each frame once: DataFrame.append was
        # quadratic in a loop and no longer exists in pandas >= 2.0.
        feature_rows = []
        label_rows = []
        for data, label in dataset:
            feature_rows.append(
                {name: value.numpy()[0][0] for name, value in data.items()})
            label_rows.append({'label': label.numpy()[0]})
        return pd.DataFrame(feature_rows), pd.DataFrame(label_rows)

    def _replace_woe(self, x, cut, woe):
        """Replace every value of ``x`` by the WOE of the bucket it falls in.

        Args:
            x: Feature series.
            cut: Bucket edges as returned by ``_pf_bin``.
            woe: One WOE value per bucket, in bucket order.

        Returns:
            A series of WOE values aligned with ``x``.
        """
        # Map integer bucket codes to WOE values instead of passing the WOE
        # list as `labels`: pd.cut rejects duplicate labels, and two buckets
        # can legitimately round to the same WOE.
        codes = pd.cut(x, cut, labels=False)
        return codes.map(dict(enumerate(woe)))

    def _woe_encoder(self, x, y):
        """WOE-encode every column of ``x`` and remember its buckets.

        Args:
            x: Feature frame.
            y: Label series aligned with ``x``.

        Returns:
            A frame with the same columns as ``x`` holding WOE values. The
            bucket data of each column is stored in ``self._bins``.
        """
        x_train_dict = {}
        for col in x.columns:
            dfx, cut, woe, iv = self._pf_bin(y, x[col])
            self._bins[col] = (dfx, cut, woe, iv)
            # replacing by the WOE encode
            x_train_dict[col] = self._replace_woe(x[col], cut, woe)
        return pd.DataFrame.from_dict(x_train_dict)

    def sqlflow_train_loop(self, dataset, epochs=1, verbose=0):
        """Fit the score card on ``dataset`` and print AUC and the card.

        Args:
            dataset: Iterable of ``(features, label)`` pairs, see
                ``_to_dataframe``.
            epochs: Accepted for interface compatibility; not used.
            verbose: Accepted for interface compatibility; not used.

        Side effects:
            Writes the WOE-encoded frame to ``/tmp/train_woe.csv`` and
            prints the hold-out AUC and the score card to stdout.
        """
        x_df, y_df = self._to_dataframe(dataset)
        x = self._woe_encoder(x_df, y_df['label'])
        x.to_csv("/tmp/train_woe.csv")
        lr = LogisticRegression()

        x_train, x_test, y_train, y_test = train_test_split(x, y_df['label'])
        lr.fit(x_train, y_train)
        # roc_auc_score raises when the hold-out labels contain one class
        # only, which a random split of a small dataset can produce.
        if y_test.nunique() > 1:
            prob = lr.predict_proba(x_test)[:, 1]
            auc_score = roc_auc_score(y_test, prob)
            print("AUC: {}\n".format(auc_score))
        else:
            print("AUC: undefined (hold-out split has a single class)\n")

        # print the score card
        print("THE SCORE CARD:")
        coe = lr.coef_
        for i, col_name in enumerate(x_df.columns):
            stats_frame, _, woe_values, _ = self._bins[col_name]
            bin_cols = stats_frame.index.to_list()
            # Points per bucket = coefficient * WOE * scaling factor.
            for bin_label, w in zip(bin_cols, woe_values):
                print(col_name, bin_label,
                      round(coe[0][i] * w * self._factor, 0))

tests/test_score_card.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from sqlflow_models import ScoreCard
2+
import unittest
3+
import tensorflow as tf
4+
from datetime import datetime, timedelta
5+
import numpy as np
6+
7+
8+
class TestScoreCard(unittest.TestCase):
    """Smoke test: train a ScoreCard on a small random dataset."""

    def create_dataset(self):
        """Build a 20-sample dataset with one feature ``'f1'`` and a 0/1 label.

        Returns:
            A ``tf.data.Dataset`` yielding ``({'f1': feature}, label)``
            pairs with float32 tensors.
        """
        samples = 20
        f = [np.random.randint(20, size=1) for i in range(samples)]
        label = [np.random.randint(2, size=1) for i in range(samples)]

        def generator():
            # Each feature is wrapped in a list so it gains a leading axis.
            for feature, target in zip(f, label):
                yield [feature], target

        def dict_mapper(feature, label):
            # The model reads features by name from a dict.
            return {'f1': feature}, label

        dataset = tf.data.Dataset.from_generator(
            generator, output_types=(tf.dtypes.float32, tf.dtypes.float32)
        )
        dataset = dataset.map(dict_mapper)
        return dataset

    def test_train(self):
        """Training must run end to end without raising."""
        dataset = self.create_dataset()
        m = ScoreCard(pf_bin_size=2)
        m.sqlflow_train_loop(dataset)
31+
32+
33+
# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)