Hi all,
I'm training tensorflow privacy on APS dataset.
The code ran without error for non private mode.
But when I set flag for dpsgd == True. It shows error. I think it must be something with the vector_loss which is the only different between the 2.
The errors are listed below:
Traceback (most recent call last):
File "aps_log_reg.py", line 210, in
app.run(main)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/absl/app.py", line 300, in run
_run_main(main, args)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "aps_log_reg.py", line 198, in main
model.train(input_fn, steps=step_per_epoch)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1154, in _train_model_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1112, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "aps_log_reg.py", line 145, in model_fn
global_step=global_step)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/optimizer.py", line 403, in minimize
grad_loss=grad_loss)
File "/Users/rachelton/DP/privacy/privacy/optimizers/dp_optimizer.py", line 170, in compute_gradients
_, sample_state = tf.while_loop(cond_fn, body_fn, [idx, sample_state])
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3556, in while_loop
return_same_structure)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3087, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3022, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/Users/rachelton/DP/privacy/privacy/optimizers/dp_optimizer.py", line 168, in
body_fn = lambda i, state: [tf.add(i, 1), process_microbatch(i, state)] # pylint: disable=line-too-long
File "/Users/rachelton/DP/privacy/privacy/optimizers/dp_optimizer.py", line 149, in process_microbatch
sample_params, sample_state, grads_list)
File "/Users/rachelton/DP/privacy/privacy/dp_query/dp_query.py", line 159, in accumulate_record
preprocessed_record = self.preprocess_record(params, record)
File "/Users/rachelton/DP/privacy/privacy/analysis/privacy_ledger.py", line 250, in preprocess_record
return self._query.preprocess_record(params, record)
File "/Users/rachelton/DP/privacy/privacy/dp_query/normalized_query.py", line 74, in preprocess_record
return self._numerator.preprocess_record(params, record)
File "/Users/rachelton/DP/privacy/privacy/dp_query/gaussian_query.py", line 100, in preprocess_record
preprocessed_record, _ = self.preprocess_record_impl(params, record)
File "/Users/rachelton/DP/privacy/privacy/dp_query/gaussian_query.py", line 96, in preprocess_record_impl
clipped_as_list, norm = tf.clip_by_global_norm(record_as_list, l2_norm_clip)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/clip_ops.py", line 278, in clip_by_global_norm
constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/math_ops.py", line 812, in binary_op_wrapper
return func(x, y, name=name)
File "/Users/rachelton/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/math_ops.py", line 912, in _truediv_python3
(x_dtype, y_dtype))
TypeError: x and y must have the same dtype, got tf.float64 != tf.float32
my code:
`
from future import absolute_import
from future import division
from future import print_function
from absl import app
from absl import flags
from distutils.version import LooseVersion
from privacy.analysis import privacy_ledger
from privacy.analysis.rdp_accountant import compute_rdp_from_ledger
from privacy.analysis.rdp_accountant import get_privacy_spent
from privacy.optimizers import dp_optimizer
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler
if LooseVersion(tf.version) < LooseVersion('2.0.0'):
GradientDescentOptimizer = tf.train.GradientDescentOptimizer
else:
GradientDescentOptimizer = tf.optimizers.SGD # pylint: disable=invalid-name
FLAGS = flags.FLAGS
flags.DEFINE_boolean(
'dpsgd', True, 'If True, train with DP-SGD. If False, '
'train with vanilla SGD.')
flags.DEFINE_float('learning_rate', 0.1, 'Learning rate for training')
flags.DEFINE_float('noise_multiplier', 1.1,
'Ratio of the standard deviation to the clipping norm')
flags.DEFINE_float('l2_norm_clip', 1.0, 'Clipping norm')
flags.DEFINE_integer('batch_size', 128, 'Batch size')
flags.DEFINE_integer('epochs', 1, 'Number of epochs')
flags.DEFINE_integer('num_steps', 1000, 'Number of steps')
flags.DEFINE_integer('num_classes', 2, 'Number of classes')
flags.DEFINE_integer('microbatches', 128, 'Number of microbatches ''(must evenly divide batch_size)')
flags.DEFINE_string('model_dir', None, 'Model directory')
class EpsilonPrintingTrainingHook(tf.estimator.SessionRunHook):
"""Training hook to print current value of epsilon after an epoch."""
def __init__(self, ledger):
"""Initalizes the EpsilonPrintingTrainingHook.
Args:
ledger: The privacy ledger.
"""
self._samples, self._queries = ledger.get_unformatted_ledger()
def end(self, session):
orders = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))
samples = session.run(self._samples)
queries = session.run(self._queries)
formatted_ledger = privacy_ledger.format_ledger(samples, queries)
rdp = compute_rdp_from_ledger(formatted_ledger, orders)
eps = get_privacy_spent(orders, rdp, target_delta=1e-5)[0]
print('For delta=1e-5, the current epsilon is: %.2f' % eps)
def get_data():
df_train = pd.read_csv('data_original/aps_failure_training_set.csv')
df_test = pd.read_csv('data_original/aps_failure_test_set.csv')
#
df_train.replace('na','-1', inplace=True)
df_test.replace('na','-1', inplace=True)
# categorical for label: 0: neg, 1: pos
df_train['class'] = pd.Categorical(df_train['class']).codes
df_test['class'] = pd.Categorical(df_test['class']).codes
# split data into x and y
Y_train = df_train['class'].copy(deep=True)
X_train = df_train.copy(deep=True)
X_train.drop(['class'], inplace=True, axis=1)
Y_test = df_test['class'].copy(deep=True)
X_test = df_test.copy(deep=True)
X_test.drop(['class'], inplace=True, axis=1)
# strings to float
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')
# scale the dataset
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
return X_train, Y_train, X_test, Y_test
def linear_layer(x_dict):
x = x_dict['images']
out_layer = tf.keras.layers.Dense(FLAGS.num_classes).apply(x)
return out_layer
def model_fn(features, labels, mode):
logits = linear_layer(features)
# vector loss: each component of the vector correspond to an individual training point and label.
# Use for per example gradient later.
vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
labels=tf.cast(labels, dtype=tf.int64))
scalar_loss = tf.reduce_mean(vector_loss)
print('*******************')
print(vector_loss.dtype)
print(scalar_loss.dtype)
if mode == tf.estimator.ModeKeys.TRAIN:
if FLAGS.dpsgd:
ledger = privacy_ledger.PrivacyLedger(
population_size=60000,
selection_probability=(FLAGS.batch_size / 60000))
optimizer = dp_optimizer.DPGradientDescentGaussianOptimizer(
l2_norm_clip=FLAGS.l2_norm_clip,
noise_multiplier=FLAGS.noise_multiplier,
num_microbatches=FLAGS.microbatches,
ledger=ledger,
learning_rate=FLAGS.learning_rate)
training_hooks = [
EpsilonPrintingTrainingHook(ledger)
]
opt_loss = vector_loss
else:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate)
opt_loss = scalar_loss
training_hooks = []
global_step = tf.train.get_global_step()
train_op = optimizer.minimize(loss=opt_loss,
global_step=global_step)
return tf.estimator.EstimatorSpec(mode=mode,
loss=scalar_loss,
train_op=train_op,
training_hooks=training_hooks)
elif mode == tf.estimator.ModeKeys.EVAL:
pred_classes = tf.argmax(logits, axis=1)
acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)
return tf.estimator.EstimatorSpec(mode=mode,
loss=scalar_loss,
eval_metric_ops={'accuracy':acc_op})
#if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)
def main(unused_argv):
tf.logging.set_verbosity(tf.logging.INFO)
if FLAGS.dpsgd and FLAGS.batch_size % FLAGS.microbatches != 0:
raise ValueError('Number of microbatches should divide evenly batch_size')
# get data: train_data, train_label, test_data, test_label
x_train, y_train, x_test, y_test = get_data()
# print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
# Init estimator
# model_fn, model_dir
model = tf.estimator.Estimator(model_fn)
# define train input
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images':x_train},
y=y_train,
batch_size=FLAGS.batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': x_test},
y=y_test,
batch_size=FLAGS.batch_size,
shuffle=False)
step_per_epoch = 60000 // FLAGS.batch_size
# train model on train input
for epoch in range(FLAGS.epochs):
model.train(input_fn, steps=step_per_epoch)
if name == "main":
app.run(main)
`