@article{2564,
  title    = {A framework for sharing confidential research data, applied to investigating differential pay by race in the U.S. government},
  year     = {Submitted},
  abstract = {Data stewards seeking to provide access to large-scale social science data face a difficult challenge. They have to share data in ways that protect privacy and confidentiality, are informative for many analyses and purposes, and are relatively straightforward for data analysts to use. We present a framework for addressing this challenge. The framework uses an integrated system that includes fully synthetic data intended for wide access, coupled with means for approved users to access the confidential data via secure remote access solutions, glued together by verification servers that allow users to assess the quality of their analyses with the synthetic data. We apply this framework to data on the careers of employees of the U.S. federal government, studying differentials in pay by race. The integrated system performs as intended, allowing users to explore the synthetic data for potential pay differentials and to learn through verifications which findings in the synthetic data hold up in the confidential data and which do not. We find differentials across races; for example, the gap between black and white female federal employees' pay increased over the time period. We present models for generating synthetic careers and differentially private algorithms for verification of regression results.},
  author   = {Barrientos, A. F. and Bolton, A. and Balmat, T. and Reiter, J. P. and Machanavajjhala, A. and Chen, Y. and Kneifel, C. and DeLong, M. and de Figueiredo, J. M.}
}

@conference{synthdiagicdm,
  title     = {Differentially private regression diagnostics},
  booktitle = {IEEE International Conference on Data Mining},
  year      = {2017},
  abstract  = {Many data producers seek to provide users access to confidential data without unduly compromising data subjects' privacy and confidentiality. When intense redaction is needed to do so, one general strategy is to require users to do analyses without seeing the confidential data, for example, by releasing fully synthetic data or by allowing users to query remote systems for disclosure-protected outputs of statistical models. With fully synthetic data or redacted outputs, the analyst never really knows how much to trust the resulting findings. In particular, if the user did the same analysis on the confidential data, would regression coefficients of interest be statistically significant or not? We present algorithms for assessing this question that satisfy differential privacy. We describe conditions under which the algorithms should give accurate answers about statistical significance. We illustrate the properties of the methods using artificial and genuine data.},
  author    = {Chen, Y. and Machanavajjhala, A. and Reiter, J. P. and Barrientos, A.}
}