@article {2566, title = {Data fusion for correcting measurement errors}, year = {Submitted}, abstract = {Often in surveys, key items are subject to measurement errors. Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. In some settings, however, analysts have access to a data source on different individuals with high quality measurements of the error-prone survey items. We present a data fusion framework for leveraging this information to improve inferences in the error-prone survey. The basic idea is to posit models about the rates at which individuals make errors, coupled with models for the values reported when errors are made. This can avoid the unrealistic assumption of conditional independence typically used in data fusion. We apply the approach on the reported values of educational attainments in the American Community Survey, using the National Survey of College Graduates as the high quality data source. In doing so, we account for the informative sampling design used to select the National Survey of College Graduates. We also present a process for assessing the sensitivity of various analyses to different choices for the measurement error models. Supplemental material is available online.}, author = {J. P. Reiter and T. Schifeling and M. De Yoreo} } @article {2558, title = {Sequential identification of nonignorable missing data mechanisms}, journal = {Statistica Sinica}, year = {Submitted}, month = {01/2017}, abstract = {With nonignorable missing data, likelihood-based inference should be based on the joint distribution of the study variables and their missingness indicators. These joint models cannot be estimated from the data alone, thus requiring the analyst to impose restrictions that make the models uniquely obtainable from the distribution of the observed data. We present an approach for constructing classes of identifiable nonignorable missing data models. The main idea is to use a sequence of carefully set up identifying assumptions, whereby we specify potentially different missingness mechanisms for different blocks of variables. We show that the procedure results in models with the desirable property of being non-parametric saturated.}, keywords = {Identification, Missing not at random, Non-parametric saturated, Partial ignorability, Sensitivity analysis}, doi = {10.5705/ss.202016.0328}, author = {Mauricio Sadinle and Jerome P. Reiter} } @article {2634, title = {The Earned Income Tax Credit and Food Insecurity: Who Benefits?}, year = {forthcoming}, author = {Shaefer, H.L. and Wilson, R.} } @article {2635, title = {The Response of Consumer Spending to Changes in Gasoline Prices}, year = {forthcoming}, abstract = {This paper estimates how overall consumer spending responds to changes in gasoline prices. It uses the differential impact across consumers of the sudden, large drop in gasoline prices in 2014 for identification. This estimation strategy is implemented using comprehensive, daily transaction-level data for a large panel of individuals. The estimated marginal propensity to consume (MPC) is approximately one, a higher estimate than estimates found in less comprehensive or well-measured data. This estimate takes into account the elasticity of demand for gasoline and potential slow adjustment to changes in prices. 
The high MPC implies that changes in gasoline prices have large aggregate effects.}, author = {Gelman, Michael and Gorodnichenko, Yuriy and Kariv, Shachar and Koustas, Dmitri and Shapiro, Matthew D and Silverman, Daniel and Tadelis, Steven} } @article {annalsSorting, title = {Sorting Between and Within Industries: A Testable Model of Assortative Matching}, journal = {Annals of Economics and Statistics}, year = {2018}, issn = {21154430, 19683863}, author = {John M. Abowd and Francis Kramarz and Sebastien Perez-Duarte and Ian M. Schmutte} } @article {2663, title = {Adaptively-Tuned Particle Swarm Optimization with Application to Spatial Design}, journal = {Stat}, volume = {6}, year = {2017}, pages = {145{\textendash}159}, abstract = {Particle swarm optimization (PSO) algorithms are a class of heuristic optimization algorithms that are attractive for complex optimization problems. We propose using PSO to solve spatial design problems, e.g. choosing new locations to add to an existing monitoring network. Additionally, we introduce two new classes of PSO algorithms that perform well in a wide variety of circumstances, called adaptively tuned PSO and adaptively tuned bare bones PSO. To illustrate these algorithms, we apply them to a common spatial design problem: choosing new locations to add to an existing monitoring network. Specifically, we consider a network in the Houston, TX, area for monitoring ambient ozone levels, which have been linked to out-of-hospital cardiac arrest rates. Published 2017. This article has been contributed to by US Government employees and their work is in the public domain in the USA}, doi = {10.1002/sta4.142}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.142/abstract}, author = {Simpson, M. and Wikle, C.K. and Holan, S.H.} } @article {2561, title = {Bayesian estimation of bipartite matchings for record linkage}, journal = { Journal of the American Statistical Association}, volume = {112}, year = {2017}, month = {03/2017}, chapter = {600}, abstract = {The bipartite record linkage task consists of merging two disparate datafiles containing information on two overlapping sets of entities. This is non-trivial in the absence of unique identifiers and it is important for a wide variety of applications given that it needs to be solved whenever we have to combine information from different sources. Most statistical techniques currently used for record linkage are derived from a seminal paper by Fellegi and Sunter (1969). These techniques usually assume independence in the matching statuses of record pairs to derive estimation procedures and optimal point estimators. We argue that this independence assumption is unreasonable and instead target a bipartite matching between the two datafiles as our parameter of interest. Bayesian implementations allow us to quantify uncertainty on the matching decisions and derive a variety of point estimators using different loss functions. We propose partial Bayes estimates that allow uncertain parts of the bipartite matching to be left unresolved. We evaluate our approach to record linkage using a variety of challenging scenarios and show that it outperforms the traditional methodology. 
We illustrate the advantages of our methods merging two datafiles on casualties from the civil war of El Salvador.}, keywords = {Assignment problem, Bayes estimate, Data matching, Fellegi-Sunter decision rule, Mixture model, Rejection option}, doi = {10.1080/01621459.2016.1148612}, url = {http://amstat.tandfonline.com/doi/abs/10.1080/01621459.2016.1148612}, author = {Mauricio Sadinle} } @article {2490, title = {Cost-Benefit Analysis for a Quinquennial Census: The 2016 Population Census of South Africa}, journal = {Journal of Official Statistics}, volume = {33}, year = {2017}, month = {02/2017}, abstract = {The question of whether to carry out a quinquennial Census is faced by national statistical offices in increasingly many countries, including Canada, Nigeria, Ireland, Australia, and South Africa. We describe uses and limitations of cost-benefit analysis in this decision problem in the case of the 2016 Census of South Africa. The government of South Africa needed to decide whether to conduct a 2016 Census or to rely on increasingly inaccurate postcensal estimates accounting for births, deaths, and migration since the previous (2011) Census. The cost-benefit analysis compared predicted costs of the 2016 Census to the benefits of improved allocation of intergovernmental revenue, which was considered by the government to be a critical use of the 2016 Census, although not the only important benefit. Without the 2016 Census, allocations would be based on population estimates. Accuracy of the postcensal estimates was estimated from the performance of past estimates, and the hypothetical expected reduction in errors in allocation due to the 2016 Census was estimated. A loss function was introduced to quantify the improvement in allocation. With this evidence, the government was able to decide not to conduct the 2016 Census, but instead to improve data and capacity for producing post-censal estimates.}, keywords = {demographic statistics, fiscal allocations, loss function, population estimates, post-censal estimates}, isbn = { 2001-7367}, doi = {10.1515/jos-2017-0013}, url = {https://www.degruyter.com/view/j/jos.2017.33.issue-1/jos-2017-0013/jos-2017-0013.xml}, author = {Spencer, Bruce D. and May, Julian and Kenyon, Steven and Seeskin, Zachary} } @article {2507, title = {Do Interviewer Post-survey Evaluations of Respondents Measure Who Respondents Are or What They Do? A Behavior Coding Study}, journal = {Public Opinion Quarterly}, year = {2017}, month = {08/2017}, abstract = {Survey interviewers are often tasked with assessing the quality of respondents{\textquoteright} answers after completing a survey interview. These interviewer observations have been used to proxy for measurement error in interviewer-administered surveys. How interviewers formulate these evaluations and how well they proxy for measurement error has received little empirical attention. According to dual-process theories of impression formation, individuals form impressions about others based on the social categories of the observed person (e.g., sex, race) and individual behaviors observed during an interaction. Although initial impressions start with heuristic, rule-of-thumb evaluations, systematic processing is characterized by extensive incorporation of available evidence. 
In a survey context, if interviewers default to heuristic information processing when evaluating respondent engagement, then we expect their evaluations to be primarily based on respondent characteristics and stereotypes associated with those characteristics. Under systematic processing, on the other hand, interviewers process and evaluate respondents based on observable respondent behaviors occurring during the question-answering process. We use the Work and Leisure Today Survey, including survey data and behavior codes, to examine proxy measures of heuristic and systematic processing by interviewers as predictors of interviewer postsurvey evaluations of respondents{\textquoteright} cooperativeness, interest, friendliness, and talkativeness. Our results indicate that CATI interviewers base their evaluations on actual behaviors during an interview (i.e., systematic processing) rather than perceived characteristics of the respondent or the interviewer (i.e., heuristic processing). These results are reassuring for the many surveys that collect interviewer observations as proxies for data quality.}, doi = {10.1093/poq/nfx026}, url = {https://doi.org/10.1093/poq/nfx026}, author = {Kirchner, Antje and Olson, Kristen and Smyth, Jolene D.} } @techreport {handle:1813:52650, title = {Effects of a Government-Academic Partnership: Has the NSF-Census Bureau Research Network Helped Secure the Future of the Federal Statistical System?}, number = {1813:52650}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

Effects of a Government-Academic Partnership: Has the NSF-Census Bureau Research Network Helped Secure the Future of the Federal Statistical System? Weinberg, Daniel; Abowd, John M.; Belli, Robert F.; Cressie, Noel; Folch, David C.; Holan, Scott H.; Levenstein, Margaret C.; Olson, Kristen M.; Reiter, Jerome P.; Shapiro, Matthew D.; Smyth, Jolene; Soh, Leen-Kiat; Spencer, Bruce; Spielman, Seth E.; Vilhuber, Lars; Wikle, Christopher The National Science Foundation-Census Bureau Research Network (NCRN) was established in 2011 to create interdisciplinary research nodes on methodological questions of interest and significance to the broader research community and to the Federal Statistical System (FSS), particularly the Census Bureau. The activities to date have covered both fundamental and applied statistical research and have focused at least in part on the training of current and future generations of researchers in skills of relevance to surveys and alternative measurement of economic units, households, and persons. This paper discusses some of the key research findings of the eight nodes, organized into six topics: (1) Improving census and survey data collection methods; (2) Using alternative sources of data; (3) Protecting privacy and confidentiality by improving disclosure avoidance; (4) Using spatial and spatio-temporal statistical modeling to improve estimates; (5) Assessing data cost and quality tradeoffs; and (6) Combining information from multiple sources. It also reports on collaborations across nodes and with federal agencies, new software developed, and educational activities and outcomes. The paper concludes with an evaluation of the ability of the FSS to apply the NCRN{\textquoteright}s research outcomes and suggests some next steps, as well as the implications of this research-network model for future federal government renewal initiatives. This paper began as a May 8, 2015 presentation to the National Academies of Science{\textquoteright}s Committee on National Statistics by two of the principal investigators of the National Science Foundation-Census Bureau Research Network (NCRN) {\textendash} John Abowd and the late Steve Fienberg (Carnegie Mellon University). The authors acknowledge the contributions of the other principal investigators of the NCRN who are not co-authors of the paper (William Block, William Eddy, Alan Karr, Charles Manski, Nicholas Nagle, and Rebecca Nugent), the co-principal investigators, and the comments of Patrick Cantwell, Constance Citro, Adam Eck, Brian Harris-Kojetin, and Eloise Parker. We note with sorrow the deaths of Stephen Fienberg and Allan McCutcheon, two of the original NCRN principal investigators. The principal investigators also wish to acknowledge Cheryl Eavey{\textquoteright}s sterling grant administration on behalf of the NSF. The conclusions reached in this paper are not the responsibility of the National Science Foundation (NSF), the Census Bureau, or any of the institutions to which the authors belong.

}, url = {http://hdl.handle.net/1813/52650}, author = {Weinberg, Daniel and Abowd, John M. and Belli, Robert F. and Cressie, Noel and Folch, David C. and Holan, Scott H. and Levenstein, Margaret C. and Olson, Kristen M. and Reiter, Jerome P. and Shapiro, Matthew D. and Smyth, Jolene and Soh, Leen-Kiat and Spencer, Bruce and Spielman, Seth E. and Vilhuber, Lars and Wikle, Christopher} } @techreport {handle:1813:52164, title = {Formal Privacy Models and Title 13}, number = {1813:52164}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Formal Privacy Models and Title 13 Nissim, Kobbi; Gasser, Urs; Smith, Adam; Vadhan, Salil; O{\textquoteright}Brien, David; Wood, Alexandra A new collaboration between academia and the Census Bureau to further the Bureau{\textquoteright}s use of formal privacy models.}, url = {http://hdl.handle.net/1813/52164}, author = {Nissim, Kobbi and Gasser, Urs and Smith, Adam and Vadhan, Salil and O{\textquoteright}Brien, David and Wood, Alexandra} } @article {2599, title = {Itemwise conditionally independent nonresponse modeling for incomplete multivariate data}, journal = {Biometrika }, volume = {104}, year = {2017}, month = {01/2017}, pages = {207-220}, chapter = {207}, abstract = {We introduce a nonresponse mechanism for multivariate missing data in which each study variable and its nonresponse indicator are conditionally independent given the remaining variables and their nonresponse indicators. This is a nonignorable missingness mechanism, in that nonresponse for any item can depend on values of other items that are themselves missing. We show that, under this itemwise conditionally independent nonresponse assumption, one can define and identify nonparametric saturated classes of joint multivariate models for the study variables and their missingness indicators. We also show how to perform sensitivity analysis to violations of the conditional independence assumptions encoded by this missingness mechanism. Throughout, we illustrate the use of this modeling approach with data analyses.}, keywords = {Loglinear model, Missing not at random, Missingness mechanism, Nonignorable, Nonparametric saturated, Sensitivity analysis}, doi = {10.1093/biomet/asw063}, url = {https://doi.org/10.1093/biomet/asw063}, author = {M. Sadinle and J.P. Reiter} } @article {sadinle:reiter:bmka, title = {Itemwise conditionally independent nonresponse modeling for multivariate categorical data}, journal = {Biometrika}, volume = {104}, year = {2017}, month = {01/2017}, pages = {207-220}, abstract = {With nonignorable missing data, likelihood-based inference should be based on the joint distribution of the study variables and their missingness indicators. These joint models cannot be estimated from the data alone, thus requiring the analyst to impose restrictions that make the models uniquely obtainable from the distribution of the observed data. We present an approach for constructing classes of identifiable nonignorable missing data models. The main idea is to use a sequence of carefully set up identifying assumptions, whereby we specify potentially different missingness mechanisms for different blocks of variables. We show that the procedure results in models with the desirable property of being non-parametric saturated.}, keywords = {Identification, Missing not at random, Non-parametric saturated, Partial ignorability, Sensitivity analysis}, author = {Sadinle, M. and Reiter, J. 
P.} } @article {doi:10.1080/07350015.2017.1356727, title = {Modeling Endogenous Mobility in Earnings Determination}, journal = {Journal of Business \& Economic Statistics}, number = {ja}, year = {2017}, pages = {0-0}, abstract = {We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax exogenous mobility by modeling the matched data as an evolving bipartite graph using a Bayesian latent-type framework. Our results suggest that allowing endogenous mobility increases the variation in earnings explained by individual heterogeneity and reduces the proportion due to employer and match effects. To assess external validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The mobility-bias corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, doi = {10.1080/07350015.2017.1356727}, url = {http://dx.doi.org/10.1080/07350015.2017.1356727}, author = {John M. Abowd and Kevin L. Mckinney and Ian M. Schmutte} } @techreport {2575, title = {Modeling Endogenous Mobility in Wage Determination}, year = {2017}, abstract = {We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax the exogenous mobility assumptions by modeling the evolution of the matched data as an evolving bipartite graph using a Bayesian latent class framework. Our results suggest that endogenous mobility biases estimated firm effects toward zero. To assess validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/28/}, author = {John M. Abowd and Kevin L. Mckinney and Ian M. 
Schmutte} } @techreport {handle:1813:52164, title = {NCRN Meeting Spring 2017: Formal Privacy Models and Title 13}, number = {1813:52164}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017: Formal Privacy Models and Title 13 Nissim, Kobbi; Gasser, Urs; Smith, Adam; Vadhan, Salil; O{\textquoteright}Brien, David; Wood, Alexandra A new collaboration between academia and the Census Bureau to further the Bureau{\textquoteright}s use of formal privacy models.}, url = {http://hdl.handle.net/1813/52164}, author = {Nissim, Kobbi and Gasser, Urs and Smith, Adam and Vadhan, Salil and O{\textquoteright}Brien, David and Wood, Alexandra} } @techreport {handle:1813:52656, title = {Presentation: Introduction to Stan for Markov Chain Monte Carlo}, number = {1813:52656}, year = {2017}, institution = {University of Missouri}, type = {Preprint}, abstract = {Presentation: Introduction to Stan for Markov Chain Monte Carlo Simpson, Matthew An introduction to Stan (http://mc-stan.org/): a probabilistic programming language that implements Hamiltonian Monte Carlo (HMC), variational Bayes, and (penalized) maximum likelihood estimation. Presentation given at the U.S. Census Bureau on April 25, 2017.}, url = {http://hdl.handle.net/1813/52656}, author = {Simpson, Matthew} } @techreport {handle:1813:46197, title = {Proceedings from the 2016 NSF{\textendash}Sloan Workshop on Practical Privacy}, number = {1813:46197}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Proceedings from the 2016 NSF{\textendash}Sloan Workshop on Practical Privacy Vilhuber, Lars; Schmutte, Ian On October 14, 2016, we hosted a workshop that brought together economists, survey statisticians, and computer scientists with expertise in the field of privacy preserving methods: Census Bureau staff working on implementing cutting-edge methods in the Bureau{\textquoteright}s flagship public-use products mingled with academic researchers from a variety of universities. The four products discussed as part of the workshop were 1. the American Community Survey (ACS); 2. Longitudinal Employer-Household Data (LEHD), in particular the LEHD Origin-Destination Employment Statistics (LODES); 3. the 2020 Decennial Census; and 4. the 2017 Economic Census. The goal of the workshop was to 1. Discuss the specific challenges that have arisen in ongoing efforts to apply formal privacy models to Census data products by drawing together expertise of academic and governmental researchers; 2. Produce short written memos that summarize concrete suggestions for practical applications to specific Census Bureau priority areas.}, url = {http://hdl.handle.net/1813/46197}, author = {Vilhuber, Lars and Schmutte, Ian} } @techreport {handle:1813:52473, title = {Proceedings from the 2017 Cornell-Census- NSF- Sloan Workshop on Practical Privacy}, number = {1813:52473}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Proceedings from the 2017 Cornell-Census- NSF- Sloan Workshop on Practical Privacy Vilhuber, Lars; Schmutte, Ian M. These proceedings report on a workshop hosted at the U.S. Census Bureau on May 8, 2017. Our purpose was to gather experts from various backgrounds together to continue discussing the development of formal privacy systems for Census Bureau data products. This workshop was a successor to a previous workshop held in October 2016 (Vilhuber \& Schmutte 2017).
At our prior workshop, we hosted computer scientists, survey statisticians, and economists, all of whom were experts in data privacy. At that time we discussed the practical implementation of cutting-edge methods for publishing data with formal, provable privacy guarantees, with a focus on applications to Census Bureau data products. The teams developing those applications were just starting out when our first workshop took place, and we spent our time brainstorming solutions to the various problems researchers were encountering, or anticipated encountering. For these cutting-edge formal privacy models, there had been very little effort in the academic literature to apply those methods in real-world settings with large, messy data. We therefore brought together an expanded group of specialists from academia and government who could shed light on technical challenges, subject matter challenges and address how data users might react to changes in data availability and publishing standards. In May 2017, we organized a follow-up workshop, which these proceedings report on. We reviewed progress made in four different areas. The four topics discussed as part of the workshop were 1. the 2020 Decennial Census; 2. the American Community Survey (ACS); 3. the 2017 Economic Census; 4. measuring the demand for privacy and for data quality. As in our earlier workshop, our goals were to 1. Discuss the specific challenges that have arisen in ongoing efforts to apply formal privacy models to Census data products by drawing together expertise of academic and governmental researchers; 2. Produce short written memos that summarize concrete suggestions for practical applications to specific Census Bureau priority areas. Comments can be provided at https://goo.gl/ZAh3YE}, url = {http://hdl.handle.net/1813/52473}, author = {Vilhuber, Lars and Schmutte, Ian M.} } @techreport {handle:1813:52472, title = {Proceedings from the Synthetic LBD International Seminar}, number = {1813:52472}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Proceedings from the Synthetic LBD International Seminar Vilhuber, Lars; Kinney, Saki; Schmutte, Ian M. On May 9, 2017, we hosted a seminar to discuss the conditions necessary to implement the SynLBD approach with interested parties, with the goal of providing a straightforward toolkit to implement the same procedure on other data. The proceedings summarize the discussions during the workshop.}, url = {http://hdl.handle.net/1813/52472}, author = {Vilhuber, Lars and Kinney, Saki and Schmutte, Ian M.} } @techreport {2567, title = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {37}, year = {2017}, month = {04/2017}, abstract = {We consider the problem of determining the optimal accuracy of public statistics when increased accuracy requires a loss of privacy. To formalize this allocation problem, we use tools from statistics and computer science to model the publication technology used by a public statistical agency. We derive the demand for accurate statistics from first principles to generate interdependent preferences that account for the public-good nature of both data accuracy and privacy loss. We first show data accuracy is inefficiently under-supplied by a private provider. Solving the appropriate social planner{\textquoteright}s problem produces an implementable publication strategy.
We implement the socially optimal publication plan for statistics on income and health status using data from the American Community Survey, National Health Interview Survey, Federal Statistical System Public Opinion Survey and Cornell National Social Survey. Our analysis indicates that welfare losses from providing too much privacy protection and, therefore, too little accuracy can be substantial.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/37/}, author = {John M. Abowd and Ian M. Schmutte} } @techreport {handle:1813:39081, title = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {1813:39081}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods Abowd, John; Schmutte, Ian M. We consider the problem of the public release of statistical information about a population{\textendash}explicitly accounting for the public-good properties of both data accuracy and privacy loss. We first consider the implications of adding the public-good component to recently published models of private data publication under differential privacy guarantees using a Vickery-Clark-Groves mechanism and a Lindahl mechanism. We show that data quality will be inefficiently under-supplied. Next, we develop a standard social planner{\textquoteright}s problem using the technology set implied by (ε, δ)-differential privacy with (α, β)-accuracy for the Private Multiplicative Weights query release mechanism to study the properties of optimal provision of data accuracy and privacy loss when both are public goods. Using the production possibilities frontier implied by this technology, explicitly parameterized interdependent preferences, and the social welfare function, we display properties of the solution to the social planner{\textquoteright}s problem. Our results directly quantify the optimal choice of data accuracy and privacy loss as functions of the technology and preference parameters. Some of these properties can be quantified using population statistics on marginal preferences and correlations between income, data accuracy preferences, and privacy loss preferences that are available from survey data. Our results show that government data custodians should publish more accurate statistics with weaker privacy guarantees than would occur with purely private data publishing. Our statistical results using the General Social Survey and the Cornell National Social Survey indicate that the welfare losses from under-providing data accuracy while over-providing privacy protection can be substantial. A complete archive of the data and programs used in this paper is available via http://doi.org/10.5281/zenodo.345385.}, url = {http://hdl.handle.net/1813/39081}, author = {Abowd, John and Schmutte, Ian M.} } @techreport {handle:1813:52612, title = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {1813:52612}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods Abowd, John; Schmutte, Ian M. We consider the problem of determining the optimal accuracy of public statistics when increased accuracy requires a loss of privacy. 
To formalize this allocation problem, we use tools from statistics and computer science to model the publication technology used by a public statistical agency. We derive the demand for accurate statistics from first principles to generate interdependent preferences that account for the public-good nature of both data accuracy and privacy loss. We first show data accuracy is inefficiently under-supplied by a private provider. Solving the appropriate social planner{\textquoteright}s problem produces an implementable publication strategy. We implement the socially optimal publication plan for statistics on income and health status using data from the American Community Survey, National Health Interview Survey, Federal Statistical System Public Opinion Survey and Cornell National Social Survey. Our analysis indicates that welfare losses from providing too much privacy protection and, therefore, too little accuracy can be substantial.}, url = {http://hdl.handle.net/1813/52612}, author = {Abowd, John and Schmutte, Ian M.} } @booklet {2502, title = {Sequential Prediction of Respondent Behaviors Leading to Error in Web-based Surveys}, year = {2017}, author = {Eck, Adam and Soh, Leen-Kiat} } @techreport {ldi40, title = {Sorting Between and Within Industries: A Testable Model of Assortative Matching}, number = {40}, year = {2017}, institution = {Labor Dynamics Institute}, type = {Document}, abstract = {We test Shimer{\textquoteright}s (2005) theory of the sorting of workers between and within industrial sectors based on directed search with coordination frictions, deliberately maintaining its static general equilibrium framework. We fit the model to sector-specific wage, vacancy and output data, including publicly-available statistics that characterize the distribution of worker and employer wage heterogeneity across sectors. Our empirical method is general and can be applied to a broad class of assignment models. The results indicate that industries are the loci of sorting{\textendash}more productive workers are employed in more productive industries. The evidence confirms that strong assortative matching can be present even when worker and employer components of wage heterogeneity are weakly correlated.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/40/}, author = {John M. Abowd and Francis Kramarz and Sebastien Perez-Duarte and Ian M. Schmutte} } @techreport {2017arXiv171002690C, title = {{Unique Entity Estimation with Application to the Syrian Conflict}}, number = {1710.02690}, year = {2017}, abstract = {Entity resolution identifies and removes duplicate entities in large, noisy databases and has grown in both usage and new developments as a result of increased data availability. Nevertheless, entity resolution has tradeoffs regarding assumptions of the data generation process, error rates, and computational scalability that make it a difficult task for real applications. In this paper, we focus on a related problem of unique entity estimation, which is the task of estimating the unique number of entities and associated standard errors in a data set with duplicate entities. Unique entity estimation shares many fundamental challenges of entity resolution, namely, that the computational cost of all-to-all entity comparisons is intractable for large databases. To circumvent this computational barrier, we propose an efficient (near-linear time) estimation algorithm based on locality sensitive hashing. 
Our estimator, under realistic assumptions, is unbiased and has provably low variance compared to existing random sampling based approaches. In addition, we empirically show its superiority over the state-of-the-art estimators on three real applications. The motivation for our work is to derive an accurate estimate of the documented, identifiable deaths in the ongoing Syrian conflict. Our methodology, when applied to the Syrian data set, provides an estimate of $191,874 \pm 1772$ documented, identifiable deaths, which is very close to the Human Rights Data Analysis Group (HRDAG) estimate of 191,369. Our work provides an example of challenges and efforts involved in solving a real, noisy, challenging problem where modeling assumptions may not hold.}, keywords = {Computer Science - Data Structures and Algorithms, Computer Science - Databases, Statistics - Applications}, url = {https://arxiv.org/abs/1710.02690}, author = {Chen, B. and Shrivastava, A. and Steorts, R.~C.} } @article {doi:10.1080/01621459.2015.1105807, title = {A Bayesian Approach to Graphical Record Linkage and Deduplication}, journal = {Journal of the American Statistical Association}, volume = {111}, number = {516}, year = {2016}, pages = {1660-1672}, abstract = {We propose an unsupervised approach for linking records across arbitrarily many files, while simultaneously detecting duplicate records within files. Our key innovation involves the representation of the pattern of links between records as a bipartite graph, in which records are directly linked to latent true individuals, and only indirectly linked to other records. This flexible representation of the linkage structure naturally allows us to estimate the attributes of the unique observable people in the population, calculate transitive linkage probabilities across records (and represent this visually), and propagate the uncertainty of record linkage into later analyses. Our method makes it particularly easy to integrate record linkage with post-processing procedures such as logistic regression, capture{\textendash}recapture, etc. Our linkage structure lends itself to an efficient, linear-time, hybrid Markov chain Monte Carlo algorithm, which overcomes many obstacles encountered by previous record linkage approaches, despite the high-dimensional parameter space. We illustrate our method using longitudinal data from the National Long Term Care Survey and with data from the Italian Survey on Household and Wealth, where we assess the accuracy of our method and show it to be better in terms of error rates and empirical scalability than other approaches in the literature. Supplementary materials for this article are available online.}, doi = {10.1080/01621459.2015.1105807}, url = {http://dx.doi.org/10.1080/01621459.2015.1105807}, author = {Rebecca C. Steorts and Rob Hall and Stephen E. Fienberg} } @article {si:reiter:hillygus16, title = {Bayesian latent pattern mixture models for handling attrition in panel studies with refreshment samples}, journal = {Annals of Applied Statistics}, volume = {10}, year = {2016}, pages = {118{\textendash}143}, doi = {10.1214/15-AOAS876}, url = {http://projecteuclid.org/euclid.aoas/1458909910}, author = {Y. Si and J. P. Reiter and D. S.
Hillygus} } @booklet {2532, title = {Data management and analytic use of paradata: SIPP-EHC audit trails}, year = {2016}, author = {Lee, Jinyoung and Seloske, Ben and C{\'o}rdova Cazar, Ana Luc{\'\i}a and Eck, Adam and Kirchner, Antje and Belli, Robert F.} } @article {2241, title = {Differentially private publication of data on wages and job mobility}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, month = {02/2016}, pages = {81-92}, chapter = {81}, abstract = {Brazil, like many countries, is reluctant to publish business-level data, because of legitimate concerns about the establishments{\textquoteright} confidentiality. A trusted data curator can increase the utility of data, while managing the risk to establishments, either by releasing synthetic data, or by infusing noise into published statistics. This paper evaluates the application of a differentially private mechanism to publish statistics on wages and job mobility computed from Brazilian employer-employee matched data. The publication mechanism can result in both the publication of specific statistics as well as the generation of synthetic data. I find that the tradeoff between the privacy guaranteed to individuals in the data, and the accuracy of published statistics, is potentially much better than the worst-case theoretical accuracy guarantee. However, the synthetic data fare quite poorly in analyses that are outside the set of queries to which it was trained. Note that this article only explores and characterizes the feasibility of these publication strategies, and will not directly result in the publication of any data. }, keywords = {Demand for public statistics, differential privacy, job mobility, matched employer-employee data, optimal confidentiality protection, optimal data accuracy, technology for statistical agencies}, doi = {10.3233/SJI-160962}, url = {http://content.iospress.com/articles/statistical-journal-of-the-iaos/sji962}, author = {Schmutte, Ian M.} } @article {2509, title = {Do Interviewers with High Cooperation Rates Behave Differently? Interviewer Cooperation Rates and Interview Behaviors}, journal = {Survey Practice}, volume = {9}, year = {2016}, month = {2016}, pages = {no pp.}, abstract = {Interviewers are required to be flexible in responding to respondent concerns during recruitment, but standardized during administration of the questionnaire. These skill sets may be at odds. Recent research has shown a U-shaped relationship between interviewer cooperation rates and interviewer variance: the least and the most successful interviewers during recruitment have the largest interviewer variance components. Little is known about why this association occurs. We posit four hypotheses for this association: 1) interviewers with higher cooperation rates are more conscientious interviewers altogether, 2) interviewers with higher cooperation rates continue to use rapport behaviors from the cooperation request throughout an interview, 3) interviewers with higher cooperation rates display more confidence which translates into different interview behavior, and 4) interviewers with higher cooperation rates continue their flexible interviewing style throughout the interview and deviate more from standardized interviewing. We use behavior codes from the Work and Leisure Today Survey (n=450, AAPOR RR3=6.3\%) to evaluate interviewer behavior. Our results largely support the confidence hypothesis.
Interviewers with higher cooperation rates do not show evidence of being {\textquotedblleft}better{\textquotedblright} interviewers.}, url = {http://www.surveypractice.org/index.php/SurveyPractice/article/view/351}, author = {Olson, Kristen and Kirchner, Antje and Smyth, Jolene D.} } @techreport {2571, title = {Estimating Compensating Wage Differentials with Endogenous Job Mobility}, year = {2016}, abstract = {We demonstrate a strategy for using matched employer-employee data to correct endogenous job mobility bias when estimating compensating wage differentials. Applied to fatality rates in the census of formal-sector jobs in Brazil between 2003-2010, we show why common approaches to eliminating ability bias can greatly amplify endogenous job mobility bias. By extending the search-theoretic hedonic wage framework, we establish conditions necessary to interpret our estimates as preferences. We present empirical analyses supporting the predictions of the model and identifying conditions, demonstrating that the standard models are misspecified, and that our proposed model eliminates latent ability and endogenous mobility biases.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/29/}, author = {Kurt Lavetti and Ian M. Schmutte} } @article {2415, title = {How Should We Define Low-Wage Work? An Analysis Using the Current Population Survey}, journal = {Monthly Labor Review}, year = {2016}, month = {October}, abstract = {Low-wage work is a central concept in considerable research, yet it lacks an agreed-upon definition. Using data from the Current Population Survey{\textquoteright}s Annual Social and Economic Supplement, the analysis presented in this article suggests that defining low-wage work on the basis of alternative hourly wage cutoffs changes the size of the low-wage population, but does not noticeably alter time trends in the rate of change. The analysis also indicates that different definitions capture groups of workers with substantively different demographic, social, and economic characteristics. Although the individuals in any of the categories examined might reasonably be considered low-wage workers, a single definition obscures these distinctions.}, url = {http://www.bls.gov/opub/mlr/2016/article/pdf/how-should-we-define-low-wage-work.pdf}, author = {Fusaro, V. and Shaefer, H. Luke} } @article {2020, title = {Incorporating marginal prior information into latent class models}, journal = {Bayesian Analysis}, volume = {11}, year = {2016}, pages = {499-518}, doi = {10.1214/15-BA959}, url = {https://projecteuclid.org/euclid.ba/1434649584}, author = {Schifeling, T. S. and Reiter, J. P.} } @article {2235, title = {Measuring Poverty Using the Supplemental Poverty Measure in the Panel Study of Income Dynamics, 1998 to 2010}, journal = {Journal of Economic and Social Measurement}, volume = {41}, year = {2016}, chapter = {17}, abstract = {The Supplemental Poverty Measure (SPM) was recently introduced by the U.S. Census Bureau as an alternative measure of poverty that addresses many shortcomings of the official poverty measure (OPM) to better reflect the resources households have available to meet their basic needs. The Census SPM is available only in the Current Population Survey (CPS). This paper describes a method for constructing SPM poverty estimates in the Panel Study of Income Dynamics (PSID), for the biennial years 1998 through 2010. A public-use dataset of individual-level SPM status produced in this analysis will be available for download on the PSID website. 
Annual SPM poverty estimates from the PSID are presented for the years 1998, 2000, 2002, 2004, 2006, 2008, and 2010 and compared to SPM estimates for the same years derived from CPS data by the Census Bureau and independent researchers. We find that SPM poverty rates in the PSID are somewhat lower than those found in the CPS, though trends over time and impact of specific SPM components are similar across the two datasets.}, doi = {10.3233/JEM-160425}, url = {http://content.iospress.com/articles/journal-of-economic-and-social-measurement/jem425}, author = {Kimberlin, S. and Shaefer, H.L. and Kim, J.} } @booklet {2515, title = {Mismatches}, year = {2016}, author = {Smyth, Jolene and Olson, Kristen} } @techreport {handle:1813:40306, title = {Modeling Endogenous Mobility in Earnings Determination}, number = {1813:40306}, year = {2016}, institution = {Cornell University}, type = {Preprint}, abstract = {Modeling Endogenous Mobility in Earnings Determination Abowd, John M.; McKinney, Kevin L.; Schmutte, Ian M. We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax the exogenous mobility assumptions by modeling the evolution of the matched data as an evolving bipartite graph using a Bayesian latent class framework. Our results suggest that endogenous mobility biases estimated firm effects toward zero. To assess validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates. Replication code can be found at DOI: http://doi.org/10.5281/zenodo.zenodo.376600 and our Github repository endogenous-mobility-replication .}, url = {http://hdl.handle.net/1813/40306}, author = {Abowd, John M. and McKinney, Kevin L. and Schmutte, Ian M.} } @techreport {handle:1813:43897, title = {NCRN Meeting Spring 2016: A 2016 View of 2020 Census Quality, Costs, Benefits}, number = {1813:43897}, year = {2016}, institution = {Northwestern University}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: A 2016 View of 2020 Census Quality, Costs, Benefits Spencer, Bruce D. Census costs affect data quality and data quality affects census benefits. Although measuring census data quality is difficult enough ex post, census planning requires it to be done well in advance. The topic of this talk is the prediction of the cost-quality curve, its uncertainty, and its relation to benefits from census data. 
Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43897}, author = {Spencer, Bruce D.} } @techreport {handle:1813:43895, title = {NCRN Meeting Spring 2016: Developing job linkages for the Health and Retirement Study}, number = {1813:43895}, year = {2016}, institution = {University of Michigan}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: Developing job linkages for the Health and Retirement Study McCue, Kristin; Abowd, John; Levenstein, Margaret; Patki, Dhiren; Rodgers, Ann; Shapiro, Matthew; Wasi, Nada This paper documents work using probabilistic record linkage to create a crosswalk between jobs reported in the Health and Retirement Study (HRS) and the list of workplaces on Census Bureau{\textquoteright}s Business Register. Matching job records provides an opportunity to join variables that occur uniquely in separate datasets, to validate responses, and to develop missing data imputation models. Identifying the respondent{\textquoteright}s workplace ({\textquotedblleft}establishment{\textquotedblright}) is valuable for HRS because it allows researchers to incorporate the effects of particular social, economic, and geospatial work environments in studies of respondent health and retirement behavior. The linkage makes use of name and address standardizing techniques tailored to business data that were recently developed in a collaboration between researchers at Census, Cornell, and the University of Michigan. The matching protocol makes no use of the identity of the HRS respondent and strictly protects the confidentiality of information about the respondent{\textquoteright}s employer. The paper first describes the clerical review process used to create a set of human-reviewed candidate pairs, and use of that set to train matching models. It then describes and compares several linking strategies that make use of employer name, address, and phone number. Finally it discusses alternative ways of incorporating information on match uncertainty into estimates based on the linked data, and illustrates their use with a preliminary sample of matched HRS jobs. Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43895}, author = {Mccue, Kristin and Abowd, John and Levenstein, Margaret and Patki, Dhiren and Rodgers, Ann and Shapiro, Matthew and Wasi, Nada} } @article {folch2016demography, title = {Spatial Variation in the Quality of {A}merican {C}ommunity {S}urvey Estimates}, journal = {Demography}, volume = {53}, number = {5}, year = {2016}, pages = {1535{\textendash}1554}, author = {Folch, David C. and Arribas-Bel, Daniel and Koschinsky, Julia and Spielman, Seth E.} } @mastersthesis {2484, title = {Topics on Official Statistics and Statistical Policy}, volume = {PHD}, year = {2016}, month = {09/2016}, pages = {24}, school = {Northwestern University}, address = {Evanston, Illinois }, abstract = {My dissertation studies decision questions for government statistical agencies, both regarding data collection and how to combine data from multiple sources. Informed decisions regarding expenditure on data collection require information about the effects of data quality on data use. For the first topic, I study two important uses of decennial census data in the U.S.: for apportioning the House of Representatives and for allocating federal funds. 
Estimates of distortions in these two uses are developed for different levels of census accuracy. Then, I thoroughly investigate the sensitivity of findings to the census error distribution and to the choice of how to measure the distortions. The chapter concludes with a proposed framework for partial cost-benefit analysis that charges a share of the cost of the census to allocation programs. Then, I investigate an approximation to make analysis of the effects of census error on allocations feasible when allocations also depend on non-census statistics, as is the case for many formula-based allocations. The approximation conditions on the realized values of the non-census statistics instead of using the joint distribution over both census and non-census statistics. The research studies how using the approximation affects conclusions. I find that in some simple cases, the approximation always either overstates or equals the true effects of census error. Understatement is possible in other cases, but theory suggests that the largest possible understatements are about one-third the amount of the largest possible overstatements. In simulations with a more complex allocation formula, the approximation tends to overstate the effects of census error with the overstatement increasing with error in non-census statistics but decreasing with error in census statistics. In the final chapter, I evaluate the use of 2008-2010 property tax data from CoreLogic, Inc. (CoreLogic), aggregated from county and township governments from around the country, to improve 2010 American Community Survey (ACS) estimates of property tax amounts for single-family homes. Particularly, I evaluate the potential to use CoreLogic to reduce respondent burden, to study survey response error and to improve adjustments for survey nonresponse. The coverage of the CoreLogic data varies between counties as does the correspondence between ACS and CoreLogic property taxes. This geographic variation implies that different approaches toward using CoreLogic are needed in different areas of the country. Further, large differences between CoreLogic and ACS property taxes in certain counties seem to be due to conceptual differences between what is collected in the two data sources. I examine three counties, Clark County, NV, Philadelphia County, PA and St. Louis County, MO, and compare how estimates would change with different approaches using the CoreLogic data. Mean county property tax estimates are highly sensitive to whether ACS or CoreLogic data are used to construct estimates. Using CoreLogic data in imputation modeling for nonresponse adjustment of ACS estimates modestly improves the predictive power of imputation models, although estimates of county property taxes and property taxes by mortgage status are not very sensitive to the imputation method.}, url = {http://search.proquest.com/docview/1826016819}, author = {Zachary Seeskin} } @article {2500, title = {Using Data Mining to Predict the Occurrence of Respondent Retrieval Strategies in Calendar Interviewing: The Quality of Retrospective Reports}, journal = {Journal of Official Statistics}, volume = {32}, year = {2016}, month = {2016}, pages = {579-600}, abstract = {Determining which verbal behaviors of interviewers and respondents are dependent on one another is a complex problem that can be facilitated via data-mining approaches. 
Data are derived from the interviews of 153 respondents of the Panel Study of Income Dynamics (PSID) who were interviewed about their life-course histories. Behavioral sequences of interviewer-respondent interactions that were most predictive of respondents spontaneously using parallel, timing, duration, and sequential retrieval strategies in their generation of answers were examined. We also examined which behavioral sequences were predictive of retrospective reporting data quality as shown by correspondence between calendar responses and responses collected in prior waves of the PSID. The verbal behaviors of immediately preceding interviewer and respondent turns of speech were assessed in terms of their co-occurrence with each respondent retrieval strategy. Interviewers{\textquoteright} use of parallel probes is associated with poorer data quality, whereas interviewers{\textquoteright} use of timing and duration probes, especially in tandem, is associated with better data quality. Respondents{\textquoteright} use of timing and duration strategies is also associated with better data quality and both strategies are facilitated by interviewer timing probes. Data mining alongside regression techniques is valuable to examine which interviewer-respondent interactions will benefit data quality. }, doi = {https://doi.org/10.1515/jos-2016-0030}, author = {Belli, Robert F. and Miller, L. Dee and Baghal, Tarek Al and Soh, Leen-Kiat} } @article {1866, title = {Accounting for nonignorable unit nonresponse and attrition in panel studies with refreshment samples}, journal = {Journal of Survey Statistics and Methodology}, volume = {3}, year = {2015}, pages = {265-295}, chapter = {265}, abstract = { Panel surveys typically suffer from attrition, which can lead to biased inference when basing analysis only on cases that complete all waves of the panel. Unfortunately, panel data alone cannot inform the extent of the bias from the attrition, so that analysts using the panel data alone must make strong and untestable assumptions about the missing data mechanism. Many panel studies also include refreshment samples, which are data collected from a random sample of new individuals during some later wave of the panel. Refreshment samples offer information that can be utilized to correct for biases induced by nonignorable attrition while reducing reliance on strong assumptions about the attrition process. To date, these bias correction methods have not dealt with two key practical issues in panel studies: unit nonresponse in the initial wave of the panel and in the refreshment sample itself. As we illustrate, nonignorable unit nonresponse can significantly compromise the analyst{\textquoteright}s ability to use the refreshment samples for attrition bias correction. Thus, it is crucial for analysts to assess how sensitive their inferences{\textemdash}corrected for panel attrition{\textemdash}are to different assumptions about the nature of the unit nonresponse. We present an approach that facilitates such sensitivity analyses, both for suspected nonignorable unit nonresponse in the initial wave and in the refreshment sample. We illustrate the approach using simulation studies and an analysis of data from the 2007-2008 Associated Press/Yahoo News election panel study. }, doi = {10.1093/jssam/smv007}, url = {http://jssam.oxfordjournals.org/content/3/3/265.abstract}, author = {Schifeling, T. and Cheng, C. and Hillygus, D. S. and Reiter, J. 
P.} } @article {1739, title = {Bayesian Analysis of Spatially-Dependent Functional Responses with Spatially-Dependent Multi-Dimensional Functional Predictors}, journal = {Statistica Sinica}, volume = {25}, year = {2015}, chapter = {205-223}, doi = {10.5705/ss.2013.245w }, url = {http://www3.stat.sinica.edu.tw/preprint/SS-13-245w_Preprint.pdf}, author = {Yang, W. H. and Wikle, C.K. and Holan, S.H. and Sudduth, K. and Meyers, D.B.} } @article {2126, title = {Bayesian Latent Pattern Mixture Models for Handling Attrition in Panel Studies With Refreshment Samples}, journal = {ArXiv}, year = {2015}, month = {09/2015}, abstract = {Many panel studies collect refreshment samples---new, randomly sampled respondents who complete the questionnaire at the same time as a subsequent wave of the panel. With appropriate modeling, these samples can be leveraged to correct inferences for biases caused by non-ignorable attrition. We present such a model when the panel includes many categorical survey variables. The model relies on a Bayesian latent pattern mixture model, in which an indicator for attrition and the survey variables are modeled jointly via a latent class model. We allow the multinomial probabilities within classes to depend on the attrition indicator, which offers additional flexibility over standard applications of latent class models. We present results of simulation studies that illustrate the benefits of this flexibility. We apply the model to correct attrition bias in an analysis of data from the 2007-2008 Associated Press/Yahoo News election panel study. }, keywords = {Categorical, Dirichlet process, Multiple imputation, Non-ignorable, Panel attrition, Refreshment sample}, url = {http://arxiv.org/abs/1509.02124}, author = {Yajuan Si and Jerome P. Reiter and D. Sunshine Hillygus} } @techreport {steorts_2015_syria, title = {{Blocking Methods Applied to Casualty Records from the Syrian Conflict}}, number = {1510.07714}, year = {2015}, url = {http://arxiv.org/abs/1510.07714}, author = {Sadosky, Peter and Shrivastava, Anshumali and Price, Megan and Steorts, Rebecca} } @article {1877, title = {Capturing multivariate spatial dependence: Model, estimate, and then predict}, journal = {Statistical Science}, volume = {30}, year = {2015}, month = {06/2015}, pages = {170-175}, doi = {10.1214/15-STS517}, url = {http://projecteuclid.org/euclid.ss/1433341474}, author = {Cressie, N. and Burden, S. and Davis, W. and Krivitsky, P. and Mokhtarian, P. and Seusse, T. and Zammit-Mangion, A.} } @article {2083, title = {Comparing and selecting spatial predictors using local criteria}, journal = {Test}, volume = {24}, year = {2015}, month = {03/2015}, pages = {1-28}, chapter = {1}, issn = {1133-0686}, doi = {10.1007/s11749-014-0415-1}, url = {http://dx.doi.org/10.1007/s11749-014-0415-1}, author = {Bradley, J.R. and Cressie, N. and Shi, T.} } @techreport {1989, title = {Cost-Benefit Analysis for a Quinquennial Census: The 2016 Population Census of South Africa.}, number = {WP-15-06}, year = {2015}, institution = {Northwestern University, Institute for Policy Research}, type = {Working Paper}, abstract = {

The question of whether to carry out a quinquennial census is being faced by national statistical offices in increasingly many countries, including Canada, Nigeria, Ireland, Australia, and South Africa. The authors describe uses, and limitations, of cost-benefit analysis for this decision problem in the case of the 2016 census of South Africa. The government of South Africa needed to decide whether to conduct a 2016 census or to rely on increasingly inaccurate post-censal estimates accounting for births, deaths, and migration since the previous (2011) census. The cost-benefit analysis compared predicted costs of the 2016 census to the benefits from improved allocation of intergovernmental revenue, which was considered by the government to be a critical use of the 2016 census, although not the only important benefit. Without the 2016 census, allocations would be based on population estimates. Accuracy of the post-censal estimates was estimated from the performance of past estimates, and the hypothetical expected reduction in errors in allocation due to the 2016 census was estimated. A loss function was introduced to quantify the improvement in allocation. With this evidence, the government was able to decide not to conduct the 2016 census, but instead to improve data and capacity for producing post-censal estimates.

}, keywords = {demographic statistics, fiscal allocations, loss function, population estimates, post-censal estimates}, url = {http://www.ipr.northwestern.edu/publications/papers/2015/ipr-wp-15-06.html}, author = {Spencer, Bruce D. and May, Julian and Kenyon, Steven and Seeskin, Zachary H.} } @conference {2120, title = {Determining Potential for Breakoff in Time Diary Survey Using Paradata}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wettlaufer, D. and Arunachalam, H. and Atkin, G. and Eck, A. and Soh, L.-K. and Belli, R.F.} } @conference {2116, title = {Do Interviewers with High Cooperation Rates Behave Differently? Interviewer Cooperation Rates and Interview Behaviors}, booktitle = {International Conference on Total Survey Error}, year = {2015}, month = {09/2015}, address = {Baltimore, MD}, url = {http://www.niss.org/events/2015-international-total-survey-error-conference}, author = {Olson, K. and Smyth, J.D. and Kirchner, A.} } @conference {2115, title = {Do Interviewers with High Cooperation Rates Behave Differently? Interviewer Cooperation Rates and Interview Behaviors}, booktitle = {Joint Statistical Meetings}, year = {2015}, month = {08/2015}, address = {Seattle, WA}, url = {http://www.amstat.org/meetings/jsm/2015/program.cfm}, author = {Olson, K. and Smyth, J.D. and Kirchner, A.} } @techreport {handle:1813:40581, title = {Economic Analysis and Statistical Disclosure Limitation}, number = {1813:40581}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {

Economic Analysis and Statistical Disclosure Limitation Abowd, John M.; Schmutte, Ian M. This paper explores the consequences for economic research of methods used by data publishers to protect the privacy of their respondents. We review the concept of statistical disclosure limitation for an audience of economists who may be unfamiliar with these methods. We characterize what it means for statistical disclosure limitation to be ignorable. When it is not ignorable, we consider the effects of statistical disclosure limitation for a variety of research designs common in applied economic research. Because statistical agencies do not always report the methods they use to protect confidentiality, we also characterize settings in which statistical disclosure limitation methods are discoverable; that is, they can be learned from the released data. We conclude with advice for researchers, journal editors, and statistical agencies.

}, url = {http://hdl.handle.net/1813/40581}, author = {Abowd, John M. and Schmutte, Ian M.} } @article {2057, title = {Economic Analysis and Statistical Disclosure Limitation}, journal = {Brookings Papers on Economic Activity}, volume = {Spring 2015}, year = {2015}, month = {03/2015}, abstract = {Economic Analysis and Statistical Disclosure Limitation Abowd, John M.; Schmutte, Ian M. This paper explores the consequences for economic research of methods used by data publishers to protect the privacy of their respondents. We review the concept of statistical disclosure limitation for an audience of economists who may be unfamiliar with these methods. We characterize what it means for statistical disclosure limitation to be ignorable. When it is not ignorable, we consider the effects of statistical disclosure limitation for a variety of research designs common in applied economic research. Because statistical agencies do not always report the methods they use to protect confidentiality, we also characterize settings in which statistical disclosure limitation methods are discoverable; that is, they can be learned from the released data. We conclude with advice for researchers, journal editors, and statistical agencies.}, issn = {00072303}, url = {http://www.brookings.edu/about/projects/bpea/papers/2015/economic-analysis-statistical-disclosure-limitation}, author = {Abowd, John M. and Schmutte, Ian M.} } @article {2100, title = {The Effect of CATI Questionnaire Design Features on Response Timing}, journal = {Journal of Survey Statistics and Methodology}, volume = {3}, year = {2015}, pages = {361-396}, doi = {10.1093/jssam/smv021}, author = {Olson, K. and Smyth, J.D.} } @techreport {1990, title = {Effects of Census Accuracy on Apportionment of Congress and Allocations of Federal Funds.}, number = {WP-15-05}, year = {2015}, institution = {Northwestern University, Institute for Policy Research}, type = {Working Paper}, abstract = {

How much accuracy is needed in the 2020 census depends on the cost of attaining accuracy and on the consequences of imperfect accuracy. The cost target for the 2020 census of the United States has been specified, and the Census Bureau is developing projections of the accuracy attainable for that cost. It is desirable to have information about the consequences of the accuracy that might be attainable for that cost or for alternative cost levels. To assess the consequences of imperfect census accuracy, Seeskin and Spencer consider alternative profiles of accuracy for states and assess their implications for apportionment of the U.S. House of Representatives and for allocation of federal funds. An error in allocation is defined as the difference between the allocation computed under imperfect data and the allocation computed with perfect data. Estimates of expected sums of absolute values of errors are presented for House apportionment and for federal funds allocations.

}, url = {http://www.ipr.northwestern.edu/publications/papers/2015/ipr-wp-15-05.html}, author = {Seeskin, Zachary H. and Spencer, Bruce D.} } @article {steorts2015, title = {Entity Resolution with Empirically Motivated Priors}, journal = {Bayesian Anal.}, volume = {10}, year = {2015}, month = {12}, pages = {849{\textendash}875}, abstract = {Databases often contain corrupted, degraded, and noisy data with duplicate entries across and within each database. Such problems arise in citations, medical databases, genetics, human rights databases, and a variety of other applied settings. The target of statistical inference can be viewed as an unsupervised problem of determining the edges of a bipartite graph that links the observed records to unobserved latent entities. Bayesian approaches provide attractive benefits, naturally providing uncertainty quantification via posterior probabilities. We propose a novel record linkage approach based on empirical Bayesian principles. Specifically, the empirical Bayesian-type step consists of taking the empirical distribution function of the data as the prior for the latent entities. This approach improves on the earlier HB approach not only by avoiding the prior specification problem but also by allowing both categorical and string-valued variables. Our extension to string-valued variables also involves the proposal of a new probabilistic mechanism by which observed record values for string fields can deviate from the values of their associated latent entities. Categorical fields that deviate from their corresponding true value are simply drawn from the empirical distribution function. We apply our proposed methodology to a simulated data set of German names and an Italian household survey on income and wealth, showing our method performs favorably compared to several standard methods in the literature. We also consider the robustness of our methods to changes in the hyper-parameters.}, doi = {10.1214/15-BA965SI}, url = {http://dx.doi.org/10.1214/15-BA965SI}, author = {Steorts, Rebecca C.} } @article {2198, title = {Entity resolution with empirically motivated priors}, journal = {Bayesian Analysis}, volume = {10}, year = {2015}, abstract = {Databases often contain corrupted, degraded, and noisy data with duplicate entries across and within each database. Such problems arise in citations, medical databases, genetics, human rights databases, and a variety of other applied settings. The target of statistical inference can be viewed as an unsupervised problem of determining the edges of a bipartite graph that links the observed records to unobserved latent entities. Bayesian approaches provide attractive benefits, naturally providing uncertainty quantification via posterior probabilities. We propose a novel record linkage approach based on empirical Bayesian principles. Specifically, the empirical Bayesian--type step consists of taking the empirical distribution function of the data as the prior for the latent entities. This approach improves on the earlier HB approach not only by avoiding the prior specification problem but also by allowing both categorical and string-valued variables. Our extension to string-valued variables also involves the proposal of a new probabilistic mechanism by which observed record values for string fields can deviate from the values of their associated latent entities. Categorical fields that deviate from their corresponding true value are simply drawn from the empirical distribution function. 
We apply our proposed methodology to a simulated data set of German names and an Italian household survey, showing our method performs favorably compared to several standard methods in the literature. We also consider the robustness of our methods to changes in the hyper-parameters.}, doi = {10.1214/15-BA965SI}, url = {http://projecteuclid.org/euclid.ba/1441790411}, author = {Steorts, Rebecca C.} } @article {1824, title = {Expanding the Discourse on Antipoverty Policy: Reconsidering a Negative Income Tax}, journal = {Journal of Poverty}, volume = {19}, year = {2015}, month = {02/2015}, pages = {218-238}, abstract = {This article proposes that advocates for the poor consider the replacement of the current means-tested safety net in the United States with a Negative Income Tax (NIT), a guaranteed income program that lifts families{\textquoteright} incomes above a minimum threshold. The article highlights gaps in service provision that leave millions in poverty, explains how a NIT could help fill those gaps, and compares current expenditures on major means-tested programs to estimated expenditures necessary for a NIT. Finally, it addresses the financial and political concerns that are likely to arise in the event that a NIT proposal gains traction among policy makers.}, keywords = {economic well-being, poverty alleviation, public policy, social welfare policy}, doi = {10.1080/10875549.2014.991889}, url = {http://dx.doi.org/10.1080/10875549.2014.991889}, author = {Jessica Wiederspan and Elizabeth Rhodes and H. Luke Shaefer} } @techreport {gelman2015individuals, title = {How individuals smooth spending: Evidence from the 2013 government shutdown using account data}, year = {2015}, institution = {National Bureau of Economic Research}, abstract = {Using comprehensive account records, this paper examines how individuals adjusted spending and saving in response to a temporary drop in income due to the 2013 U.S. government shutdown. The shutdown cut paychecks by 40\% for affected employees, which was recovered within 2 weeks. Though the shock was short-lived and completely reversed, spending dropped sharply implying a na{\"\i}ve estimate of the marginal propensity to spend of 0.58. This estimate overstates how consumption responded. While many individuals had low liquidity, they used multiple strategies to smooth consumption including delay of recurring payments such as mortgages and credit card balances.}, author = {Gelman, Michael and Kariv, Shachar and Shapiro, Matthew D and Silverman, Dan and Tadelis, Steven} } @conference {2103, title = {I Know What You Did Next: Predicting Respondent{\textquoteright}s Next Activity Using Machine Learning}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {May 14-17, 2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Arunachalam, H. and Atkin, G. and Eck, A. and Wettlaufer, D. and Soh, L.-K. and Belli, R.F.} } @techreport {handle:1813:40169, title = {Introduction to The Survey of Income and Program Participation (SIPP)}, number = {1813:40169}, year = {2015}, institution = {University of Michigan}, type = {Preprint}, abstract = {Introduction to The Survey of Income and Program Participation (SIPP) Shaefer, H. 
Luke Goals for the SIPP Workshop: provide you with an introduction to the SIPP and get you up and running on the public-use SIPP files; offer some advanced tools for 2008 Panel SIPP data analysis; get you some experience analyzing SIPP data; introduce you to the SIPP EHC (SIPP Redesign); introduce you to the SIPP Synthetic Beta (SSB). Presentation made on May 15, 2015 at the Census Bureau, and previously in 2014 at Duke University and University of Michigan}, url = {http://hdl.handle.net/1813/40169}, author = {Shaefer, H. Luke} } @techreport {handle:1813:40306, title = {Modeling Endogenous Mobility in Wage Determination}, number = {1813:40306}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {Modeling Endogenous Mobility in Wage Determination Abowd, John M.; McKinney, Kevin L.; Schmutte, Ian M. We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax the exogenous mobility assumptions by modeling the evolution of the matched data as an evolving bipartite graph using a Bayesian latent class framework. Our results suggest that endogenous mobility biases estimated firm effects toward zero. To assess validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, url = {http://hdl.handle.net/1813/40306}, author = {Abowd, John M. and McKinney, Kevin L. and Schmutte, Ian M.} } @techreport {handle:1813:52608, title = {Modeling Endogenous Mobility in Wage Determination}, number = {1813:52608}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Modeling Endogenous Mobility in Wage Determination Abowd, John M.; McKinney, Kevin L.; Schmutte, Ian M. We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax exogenous mobility by modeling the matched data as an evolving bipartite graph using a Bayesian latent-type framework. Our results suggest that allowing endogenous mobility increases the variation in earnings explained by individual heterogeneity and reduces the proportion due to employer and match effects. To assess external validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The mobility-bias corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, url = {http://hdl.handle.net/1813/52608}, author = {Abowd, John M. and McKinney, Kevin L.
and Schmutte, Ian M.} } @article {2019, title = {Multiple imputation for harmonizing longitudinal non-commensurate measures in individual participant data meta-analysis}, journal = {Statistics in Medicine}, year = {2015}, doi = {10.1002/sim.6562}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sim.6562/abstract}, author = {Siddique, J. and Reiter, J. P. and Brincks, A. and Gibbons, R. and Crespi, C. and Brown, C. H.} } @techreport {handle:1813:40183, title = {NCRN Meeting Spring 2015: Geography and Usability of the American Community Survey}, number = {1813:40183}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Geography and Usability of the American Community Survey Spielman, Seth Presentation at the NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40183}, author = {Spielman, Seth} } @techreport {handle:1813:40184, title = {NCRN Meeting Spring 2015: Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {1813:40184}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods Abowd, John M.; Schmutte, Ian Presentation at the NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40184}, author = {Abowd, John M. and Schmutte, Ian} } @conference {2118, title = {Predicting Breakoff Using Sequential Machine Learning Methods}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Soh, L.-K. and Eck, A. and McCutcheon, A.L.} } @mastersthesis {Shrivastava2015, title = {Probabilistic Hashing Techniques For Big Data}, volume = {Ph.D. }, year = {2015}, school = {Cornell University}, type = {Dissertation}, abstract = {We investigate probabilistic hashing techniques for addressing computational and memory challenges in large scale machine learning and data mining systems. In this thesis, we show that the traditional idea of hashing goes far beyond near-neighbor search and there are some striking new possibilities. We show that hashing can improve state of the art large scale learning algorithms, and it goes beyond the conventional notions of pairwise similarities. Despite being a very well studied topic in the literature, we found several opportunities for fundamentally improving some of the well-known textbook hashing algorithms. In particular, we show that the traditional way of computing minwise hashes is unnecessarily expensive and without losing anything we can achieve an order of magnitude speedup. We also found that for cosine similarity search there is a better scheme than SimHash. In the end, we show that the existing locality sensitive hashing framework itself is very restrictive, and we cannot have efficient algorithms for some important measures like inner products which are ubiquitous in machine learning. We propose asymmetric locality sensitive hashing (ALSH), an extended framework, where we show provable and practical efficient algorithms for Maximum Inner Product Search (MIPS). Having such efficient solutions to MIPS directly scales up many popular machine learning algorithms. 
We believe that this thesis provides significant improvements to some of the heavily used subroutines in big-data systems, which we hope will be adopted.}, url = {https://ecommons.cornell.edu/handle/1813/40886}, author = {Anshumali Shrivastava} } @mastersthesis {2268, title = {Ranking Firms Using Revealed Preference and Other Essays About Labor Markets}, year = {2015}, school = {University of Michigan}, type = {Ph.D.}, address = {Ann Arbor, MI}, abstract = {This dissertation contains essays on three questions about the labor market. Chapter 1 considers the question: why do some firms pay so much and some so little? Firms account for a substantial portion of earnings inequality. Although the standard explanation is that there are search frictions that support an equilibrium with rents, this chapter finds that compensating differentials for nonpecuniary characteristics are at least as important. To reach this finding, this chapter develops a structural search model and estimates it on U.S. administrative data. The model analyzes the revealed preference information in the labor market: specifically, how workers move between the 1.5 million firms in the data. With on the order of 1.5 million parameters, standard estimation approaches are infeasible and so the chapter develops a new estimation approach that is feasible on such big data. Chapter 2 considers the question: why do men and women work at different firms? Men work for higher-paying firms than women. The chapter builds on chapter 1 to consider two explanations for why men and women work in different firms. First, men and women might search from different offer distributions. Second, men and women might have different rankings of firms. Estimation finds that the main explanation for why men and women are sorted is that women search from a lower-paying offer distribution than men. Indeed, men and women are estimated to have quite similar rankings of firms. Chapter 3 considers the question: what are the long-run effects of the minimum wage? An empirical consensus suggests that there are small employment effects of minimum wage increases. This chapter argues that these are short-run elasticities. Long-run elasticities, which may differ from short-run elasticities, are more policy relevant. This chapter develops a dynamic industry equilibrium model of labor demand. The model makes two points. First, long-run regressions have been misinterpreted because even if the short- and long-run employment elasticities differ, standard methods would not detect a difference using U.S. variation. Second, the model offers a reconciliation of the small estimated short-run employment effects with the commonly found pass-through of minimum wage increases to product prices.}, keywords = {economics, labor markets}, url = {http://hdl.handle.net/2027.42/116747}, author = {Isaac Sorkin} } @conference {2117, title = {Recording What the Respondent Says: Does Question Format Matter?}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Smyth, J.D. 
and Olson, K.} } @article {spielman2015plos, title = {Reducing the Margins of Error in the American Community Survey Through Data-Driven Regionalization}, journal = {PlosOne}, year = {2015}, month = {02/2015}, doi = {10.1371/journal.pone.0115626}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115626}, author = {Folch, D. and Spielman, S. E.} } @article {2084, title = {Rejoinder on: Comparing and selecting spatial predictors using local criteria}, journal = {Test}, volume = {24}, year = {2015}, month = {03/2015}, pages = {54-60}, issn = {1133-0686}, doi = {10.1007/s11749-014-0414-2}, url = {http://dx.doi.org/10.1007/s11749-014-0414-2}, author = {Bradley, J.R. and Cressie, N. and Shi, T.} } @article {2085, title = {The SAR model for very large datasets: A reduced-rank approach}, journal = {Econometrics}, volume = {3}, year = {2015}, pages = {317-338}, issn = {2225-1146}, doi = {10.3390/econometrics3020317}, url = {http://www.mdpi.com/2225-1146/3/2/317}, author = {Burden, S. and Cressie, N. and Steel, D.G.} } @article {1575, title = {Semi-parametric selection models for potentially non-ignorable attrition in panel studies with refreshment samples}, journal = {Political Analysis}, volume = {23}, year = {2015}, pages = {92-112}, chapter = {92}, url = {http://pan.oxfordjournals.org/cgi/reprint/mpu009?\%20ijkey=joX8eSl6gyIlQKP\&keytype=ref}, author = {Y. Si and J.P. Reiter and D.S. Hillygus} } @article {doi:10.1080/00045608.2015.1052335, title = {Studying Neighborhoods Using Uncertain Data from the American Community Survey: A Contextual Approach}, journal = {Annals of the Association of American Geographers}, volume = {105}, number = {5}, year = {2015}, pages = {1003-1025}, abstract = {In 2010 the American Community Survey (ACS) replaced the long form of the decennial census as the sole national source of demographic and economic data for small geographic areas such as census tracts. These small area estimates suffer from large margins of error, however, which makes the data difficult to use for many purposes. The value of a large and comprehensive survey like the ACS is that it provides a richly detailed, multivariate, composite picture of small areas. This article argues that one solution to the problem of large margins of error in the ACS is to shift from a variable-based mode of inquiry to one that emphasizes a composite multivariate picture of census tracts. Because the margin of error in a single ACS estimate, like household income, is assumed to be a symmetrically distributed random variable, positive and negative errors are equally likely. Because the variable-specific estimates are largely independent from each other, when looking at a large collection of variables these random errors average to zero. This means that although single variables can be methodologically problematic at the census tract scale, a large collection of such variables provides utility as a contextual descriptor of the place(s) under investigation. This idea is demonstrated by developing a geodemographic typology of all U.S. census tracts. The typology is firmly rooted in the social scientific literature and is organized around a framework of concepts, domains, and measures. The typology is validated using public domain data from the City of Chicago and the U.S. Federal Election Commission. The typology, as well as the data and methods used to create it, is open source and published freely online. 
}, doi = {10.1080/00045608.2015.1052335}, url = {http://dx.doi.org/10.1080/00045608.2015.1052335}, author = {Seth E. Spielman and Alex Singleton} } @article {2419, title = {Understanding the Dynamics of $2-a-Day Poverty in the United States}, journal = {The Russell Sage Foundation Journal of the Social Sciences}, volume = {1}, year = {2015}, author = {Shaefer, H. Luke and Edin, Kathryn and Talbert, E.} } @article {2206, title = {Understanding the Human Condition through Survey Informatics}, journal = {IEEE Computer}, volume = {48}, year = {2015}, pages = {112-116}, issn = {0018-9162}, doi = {10.1109/MC.2015.327}, author = {Eck, A. and Leen-Kiat, S. and McCutcheon, A. L. and Smyth, J.D. and Belli, R.F.} } @conference {2105, title = {Using Data Mining to Examine Interviewer-Respondent Interactions in Calendar Interviews}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Belli, R.F. and Miller, L.D. and Soh, L.-K. and T. Al Baghal} } @conference {2104, title = {Using Machine Learning Techniques to Predict Respondent Type from A Priori Demographic Information}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {May 14-17, 2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Wettlaufer, D. and Soh, L.-K. and Belli, R.F.} } @conference {2114, title = {Why Do Interviewers Speed Up? An Examination of Changes in Interviewer Behaviors over the Course of the Survey Field Period}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Olson, K. and Smyth, J.D.} } @article {spielman2014causes, title = {Causes and Patterns of Uncertainty in the American Community Survey}, journal = {Applied Geography}, volume = {46}, year = {2014}, pages = {147-157}, doi = {DOI: 10.1016/j.apgeog.2013.11.002 http://dx.doi.org/10.1016/j.apgeog.2013.11.002}, url = {http://www.sciencedirect.com/science/article/pii/S0143622813002518}, author = {Spielman, S. E. and Folch, D. and Nagle, N.} } @article {spielman2014coevolution, title = {The Co-Evolution of Residential Segregation and the Built Environment at the Turn of the 20th Century: a Schelling Model}, journal = {Transactions in GIS}, volume = {18}, number = {1}, year = {2014}, pages = {25-45}, doi = {DOI: 10.1111/tgis.12014}, url = {http://onlinelibrary.wiley.com/enhanced/doi/10.1111/tgis.12014/}, author = {Spielman, S. E. and Harrison, P.} } @inbook {ste:ven:sad:2014, title = {A Comparison of Blocking Methods for Record Linkage}, booktitle = {Privacy in Statistical Databases}, volume = {8744}, year = {2014}, pages = {253{\textendash}268}, publisher = {Springer}, organization = {Springer}, doi = {10.1007/978-3-319-11257-2_20}, url = {http://link.springer.com/chapter/10.1007/978-3-319-11257-2_20}, author = {Steorts, R. and Ventura, S. and Sadinle, M. and Fienberg, S. E. and Domingo-Ferrer, J.} } @article {1743, title = {A Comparison of Spatial Predictors when Datasets Could be Very Large}, journal = {ArXiv}, year = {2014}, abstract = {

In this article, we review and compare a number of methods of spatial prediction. To demonstrate the breadth of available choices, we consider both traditional and more-recently-introduced spatial predictors. Specifically, in our exposition we review: traditional stationary kriging, smoothing splines, negative-exponential distance-weighting, Fixed Rank Kriging, modified predictive processes, a stochastic partial differential equation approach, and lattice kriging. This comparison is meant to provide a service to practitioners wishing to decide between spatial predictors. Hence, we provide technical material for the unfamiliar, which includes the definition and motivation for each (deterministic and stochastic) spatial predictor. We use a benchmark dataset of\ CO2\ data from NASA{\textquoteright}s AIRS instrument to address computational efficiencies that include CPU time and memory usage. Furthermore, the predictive performance of each spatial predictor is assessed empirically using a hold-out subset of the AIRS data.

}, keywords = {Statistics - Methodology}, url = {http://arxiv.org/abs/1410.7748}, author = {Bradley, J.~R. and Cressie, N. and Shi, T.} } @article {nagle2014dasymetric, title = {Dasymetric Modeling and Uncertainty}, journal = {The Annals of the Association of American Geographers}, volume = {104}, number = {1}, year = {2014}, pages = {80-95}, doi = {DOI: 10.1080/00045608.2013.843439}, url = {http://www.tandfonline.com/doi/abs/10.1080/00045608.2013.843439}, author = {Nagle, N. and Buttenfield, B. and Leyk, S. and Spielman, S. E.} } @conference {2136, title = {Designing an Intelligent Time Diary Instrument: Visualization, Dynamic Feedback, and Error Prevention and Mitigation}, booktitle = {UNL/SRAM/Gallup Symposium}, year = {2014}, address = {Omaha, NE}, url = {http://grc.unl.edu/unlsramgallup-symposium}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Soh, L.-K. and Belli, R.F.} } @conference {2135, title = {Designing an Intelligent Time Diary Instrument: Visualization, Dynamic Feedback, and Error Prevention and Mitigation}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA. }, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Soh, L.-K. and Belli, R.} } @article {sad:2014, title = {Detecting Duplicates in a Homicide Registry Using a Bayesian Partitioning Approach}, journal = {Annals of Applied Statistics}, volume = {8}, number = {4}, year = {2014}, pages = {2404{\textendash}2434}, author = {Sadinle, M.} } @conference {2159, title = {The Effect of CATI Questionnaire Design Features on Response Timing}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Olson, K. and Smyth, Jolene} } @article {2014arXiv1409.0643S, title = {{Entity Resolution with Empirically Motivated Priors}}, journal = {ArXiv }, number = {1409.0643}, year = {2014}, abstract = {Databases often contain corrupted, degraded, and noisy data with duplicate entries across and within each database. Such problems arise in citations, medical databases, genetics, human rights databases, and a variety of other applied settings. The target of statistical inference can be viewed as an unsupervised problem of determining the edges of a bipartite graph that links the observed records to unobserved latent entities. Bayesian approaches provide attractive benefits, naturally providing uncertainty quantification via posterior probabilities. We propose a novel record linkage approach based on empirical Bayesian principles. Specifically, the empirical Bayesian--type step consists of taking the empirical distribution function of the data as the prior for the latent entities. This approach improves on the earlier HB approach not only by avoiding the prior specification problem but also by allowing both categorical and string-valued variables. Our extension to string-valued variables also involves the proposal of a new probabilistic mechanism by which observed record values for string fields can deviate from the values of their associated latent entities. Categorical fields that deviate from their corresponding true value are simply drawn from the empirical distribution function. 
We apply our proposed methodology to a simulated data set of German names and an Italian household survey, showing our method performs favorably compared to several standard methods in the literature. We also consider the robustness of our methods to changes in the hyper-parameters.}, keywords = {Statistics - Methodology}, url = {http://arxiv.org/abs/1409.0643}, author = {Steorts, R.~C.} } @article {1800, title = {Harnessing Naturally Occurring Data to Measure the Response of Spending to Income}, journal = {Science}, volume = {345}, year = {2014}, chapter = {212-215}, abstract = {This paper presents a new data infrastructure for measuring economic activity. The infrastructure records transactions and account balances, yielding measurements with scope and accuracy that have little precedent in economics. The data are drawn from a diverse population that overrepresents males and younger adults but contains large numbers of underrepresented groups. The data infrastructure permits evaluation of a benchmark theory in economics that predicts that individuals should use a combination of cash management, saving, and borrowing to make the timing of income irrelevant for the timing of spending. As in previous studies and in contrast to the predictions of the theory, there is a response of spending to the arrival of anticipated income. The data also show, however, that this apparent excess sensitivity of spending results largely from the coincident timing of regular income and regular spending. The remaining excess sensitivity is concentrated among individuals with less liquidity. Link to data at Berkeley Econometrics Lab (EML): https://eml.berkeley.edu/cgi-bin/HarnessingDataScience2014.cgi}, doi = {10.1126/science.1247727}, url = {http://www.sciencemag.org/content/345/6193/212.full}, author = {Gelman, M. and Kariv, S. and Shapiro, M.D. and Silverman, D. and Tadelis, S.} } @conference {2165, title = {Having a Lasting Impact: The Effects of Interviewer Errors on Data Quality}, booktitle = {Midwest Association for Public Opinion Research Annual Conference}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Timm, A. and Olson, K. and Smyth, J.D.} } @conference {2142, title = {Hours or Minutes: Does One Unit Fit All?}, booktitle = {Midwest Association for Public Opinion Research Annual Conference}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Cochran, B. and Smyth, J.D.} } @article {pee:acq:sha:2014, title = {I Cheated, but only a Little{\textendash}Partial Confessions to Unethical Behavior}, journal = {Journal of Personality and Social Psychology}, volume = {106}, number = {2}, year = {2014}, pages = {202{\textendash}217}, author = {Peer, E. and Acquisti, A. and Shalvi, S.} } @article {folch2014identifying, title = {Identifying Regions based on Flexible User Defined Constraints}, journal = {International Journal of Geographic Information Science}, volume = {28}, number = {1}, year = {2014}, pages = {164-184}, doi = {10.1080/13658816.2013.848986}, url = {http://www.tandfonline.com/doi/abs/10.1080/13658816.2013.848986}, author = {Folch, D. and Spielman, S. E.} } @conference {2149, title = {Making sense of paradata: Challenges faced and lessons learned}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Eck, A. and Stuart, L. and Atkin, G. 
and Soh, L-K and McCutcheon, A.L. and Belli, R.F.} } @conference {2148, title = {Making Sense of Paradata: Challenges Faced and Lessons Learned}, booktitle = {UNL/SRAM/Gallup Symposium}, year = {2014}, address = {Omaha, NE}, url = {http://grc.unl.edu/unlsramgallup-symposium}, author = {Eck, A. and Stuart, L. and Atkin, G. and Soh, L-K and McCutcheon, A.L. and Belli, R.F.} } @techreport {handle:1813:37748, title = {NCRN Meeting Fall 2014: Constrained Smoothed Bayesian Estimation}, number = {1813:37748}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Constrained Smoothed Bayesian Estimation Steorts, Rebecca; Shalizi, Cosma Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37748}, author = {Steorts, Rebecca and Shalizi, Cosma} } @techreport {handle:1813:37411, title = {NCRN Meeting Fall 2014: Decomposing Medical-Care Expenditure Growth}, number = {1813:37411}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Decomposing Medical-Care Expenditure Growth Dunn, Abe; Liebman, Eli; Shapiro, Adam}, url = {http://hdl.handle.net/1813/37411}, author = {Dunn, Abe and Liebman, Eli and Shapiro, Adam} } @techreport {handle:1813:37747, title = {NCRN Meeting Fall 2014: Designer Census Geographies}, number = {1813:37747}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Designer Census Geographies Spielman, Seth Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37747}, author = {Spielman, Seth} } @techreport {handle:1813:37414, title = {NCRN Meeting Fall 2014: Respondent-Driven Sampling Estimation and the National HIV Behavioral Surveillance System}, number = {1813:37414}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Respondent-Driven Sampling Estimation and the National HIV Behavioral Surveillance System Spiller, Michael (Trey)}, url = {http://hdl.handle.net/1813/37414}, author = {Spiller, Michael (Trey)} } @techreport {handle:1813:40828, title = {A New Method for Protecting Interrelated Time Series with Bayesian Prior Distributions and Synthetic Data}, number = {1813:40828}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {A New Method for Protecting Interrelated Time Series with Bayesian Prior Distributions and Synthetic Data Schneider, Matthew J.; Abowd, John M. Organizations disseminate statistical summaries of administrative data via the Web for unrestricted public use. They balance the trade-off between confidentiality protection and inference quality. Recent developments in disclosure avoidance techniques include the incorporation of synthetic data, which capture the essential features of underlying data by releasing altered data generated from a posterior predictive distribution. The United States Census Bureau collects millions of interrelated time series micro-data that are hierarchical and contain many zeros and suppressions. Rule-based disclosure avoidance techniques often require the suppression of count data for small magnitudes and the modification of data based on a small number of entities. 
Motivated by this problem, we use zero-inflated extensions of Bayesian Generalized Linear Mixed Models (BGLMM) with privacy-preserving prior distributions to develop methods for protecting and releasing synthetic data from time series about thousands of small groups of entities without suppression based on the magnitudes or number of entities. We find that as the prior distributions of the variance components in the BGLMM become more precise toward zero, confidentiality protection increases and inference quality deteriorates. We evaluate our methodology using a strict privacy measure, empirical differential privacy, and a newly defined risk measure, Probability of Range Identification (PoRI), which directly measures attribute disclosure risk. We illustrate our results with the U.S. Census Bureau{\textquoteright}s Quarterly Workforce Indicators.}, url = {http://hdl.handle.net/1813/40828}, author = {Schneider, Matthew J. and Abowd, John M.} } @article {singleton2014geodem, title = {The Past, Present, and Future of Geodemographic Research in the United States and United Kingdom}, journal = {The Professional Geographer}, volume = {4}, year = {2014}, author = {Singleton, A. and Spielman, S. E.} } @techreport {handle:1813:38121, title = {Reducing Uncertainty in the American Community Survey through Data-Driven Regionalization}, number = {1813:38121}, year = {2014}, institution = {University of Colorado at Boulder / University of Tennessee}, type = {Preprint}, abstract = {Reducing Uncertainty in the American Community Survey through Data-Driven Regionalization Spielman, Seth; Folch, David The American Community Survey (ACS) is the largest US survey of households and is the principal source for neighborhood scale information about the US population and economy. The ACS is used to allocate billions in federal spending and is a critical input to social scientific research in the US. However, estimates from the ACS can be highly unreliable. For example, in over 72\% of census tracts the estimated number of children under 5 in poverty has a margin of error greater than the estimate. Uncertainty of this magnitude complicates the use of social data in policy making, research, and governance. This article develops a spatial optimization algorithm that is capable of reducing the margins of error in survey data via the creation of new composite geographies, a process called regionalization. Regionalization is a complex combinatorial problem. Here rather than focusing on the technical aspects of regionalization we demonstrate how to use a purpose built open source regionalization algorithm to post-process survey data in order to reduce the margins of error to some user-specified threshold.}, url = {http://hdl.handle.net/1813/38121}, author = {Spielman, Seth and Folch, David} } @conference {ste:hal:fie:2014, title = {SMERED: A Bayesian Approach to Graphical Record Linkage and De-duplication}, booktitle = {AISTATS 2014 Proceedings, JMLR}, volume = {33}, year = {2014}, pages = {922{\textendash}930}, publisher = {W\& CP}, organization = {W\& CP}, author = {Steorts, R. and Hall, R. and Fienberg, S. E.} } @techreport {handle:1813:52607, title = {Sorting Between and Within Industries: A Testable Model of Assortative Matching}, number = {1813:52607}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {Sorting Between and Within Industries: A Testable Model of Assortative Matching Abowd, John M.; Kramarz, Francis; Perez-Duarte, Sebastien; Schmutte, Ian M. 
We test Shimer{\textquoteright}s (2005) theory of the sorting of workers between and within industrial sectors based on directed search with coordination frictions, deliberately maintaining its static general equilibrium framework. We fit the model to sector-specific wage, vacancy and output data, including publicly-available statistics that characterize the distribution of worker and employer wage heterogeneity across sectors. Our empirical method is general and can be applied to a broad class of assignment models. The results indicate that industries are the loci of sorting{\textendash}more productive workers are employed in more productive industries. The evidence confirms that strong assortative matching can be present even when worker and employer components of wage heterogeneity are weakly correlated.}, url = {http://hdl.handle.net/1813/52607}, author = {Abowd, John M. and Kramarz, Francis and Perez-Duarte, Sebastien and Schmutte, Ian M.} } @article {spielman2014spatial, title = {Spatial Collective Intelligence? Accuracy, Credibility in Crowdsourced Data}, journal = {Cartography and Geographic Information Science}, volume = {41}, number = {2}, year = {2014}, pages = {115-124}, doi = {http://dx.doi.org/10.1080/15230406.2013.874200}, url = {http://go.galegroup.com/ps/i.do?action=interpret\&id=GALE|A361943563\&v=2.1\&u=nysl_sc_cornl\&it=r\&p=AONE\&sw=w\&authCount=1}, author = {Spielman, S. E.} } @conference {griffin2014supporting2, title = {Supporting Planners{\textquoteright} Work with Uncertain Demographic Data}, booktitle = {GIScience Workshop on Uncertainty Visualization}, volume = {23}, year = {2014}, url = {http://cognitivegiscience.psu.edu/uncertainty2014/papers/griffin_demographic.pdf.}, author = {Griffin, A. L. and Spielman, S. E. and Jurjevich, J. and Merrick, M. and Nagle, N. N. and Folch, D. C.} } @conference {griffin2014supporting, title = {Supporting Planners{\textquoteright} work with Uncertain Demographic Data}, booktitle = {Proceedings of IEEE VIS 2014}, year = {2014}, pages = {9{\textendash}14}, publisher = {Proceedings of IEEE VIS 2014}, organization = {Proceedings of IEEE VIS 2014}, url = {http://cognitivegiscience.psu.edu/uncertainty2014/papers/griffin_demographic.pdf}, author = {Griffin, A. L. and Spielman, S. E. and Nagle, N. N. and Jurjevich, J. and Merrick, M. and Folch, D. C.} } @conference {2147, title = {Survey Informatics: Ideas, Opportunities, and Discussions}, booktitle = {UNL/SRAM/Gallup Symposium}, year = {2014}, address = {Omaha, NE}, url = {http://grc.unl.edu/unlsramgallup-symposium}, author = {Eck, A. and Soh, L-K} } @techreport {handle:1813:38122, title = {Uncertain Uncertainty: Spatial Variation in the Quality of American Community Survey Estimates}, number = {1813:38122}, year = {2014}, institution = {University of Colorado at Boulder / University of Tennessee}, type = {Preprint}, abstract = {Uncertain Uncertainty: Spatial Variation in the Quality of American Community Survey Estimates Folch, David C.; Arribas-Bel, Daniel; Koschinsky, Julia; Spielman, Seth E. The U.S. Census Bureau{\textquoteright}s American Community Survey (ACS) is the foundation of social science research, much federal resource allocation and the development of public policy and private sector decisions. However, the high uncertainty associated with some of the ACS{\textquoteright}s most frequently used estimates can jeopardize the accuracy of inferences based on these data. 
While there is high-level understanding in the research community that problems exist in the data, the sources and implications of these problems have been largely overlooked. Using 2006-2010 ACS median household income at the census tract scale as the test case (where a third of small-area estimates have higher than recommended errors), we explore the patterns in the uncertainty of ACS data. We consider various potential sources of uncertainty in the data, ranging from response level to geographic location to characteristics of the place. We find that there exist systematic patterns in the uncertainty in both the spatial and attribute dimensions. Using a regression framework, we identify the factors that are most frequently correlated with the error at national, regional and metropolitan area scales, and find these correlates are not consistent across the various locations tested. The implication is that data quality varies in different places, making cross-sectional analysis both within and across regions less reliable. We also present general advice for data users and potential solutions to the challenges identified.}, url = {http://hdl.handle.net/1813/38122}, author = {Folch, David C. and Arribas-Bel, Daniel and Koschinsky, Julia and Spielman, Seth E.} } @article {kimberlin2014updated, title = {An updated method for calculating income and payroll taxes from PSID data using the NBER{\textquoteright}s TAXSIM, for PSID survey years 1999 through 2011}, journal = {Unpublished manuscript, University of Michigan. Accessed May}, volume = {6}, year = {2014}, pages = {2016}, abstract = {This paper describes a method to calculate income and payroll taxes from Panel Study of Income Dynamics data using the NBER's Internet TAXSIM version 9 (http://users.nber.org/~taxsim/taxsim9/), for PSID survey years 1999, 2001, 2003, 2005, 2007, 2009, and 2011 (tax years n-1). These methods are implemented in two Stata programs, designed to be used with the PSID public-use zipped Main Interview data files: PSID_TAXSIM_1of2.do and PSID_TAXSIM_2of2.do. The main program (2of2) was written by Sara Kimberlin (skimberlin@berkeley.edu) and generates all TAXSIM input variables, runs TAXSIM, adjusts tax estimates using additional information available in PSID data, and calculates total PSID family unit taxes. A separate program (1of2) was written by Jiyoon (June) Kim (junekim@umich.edu) in collaboration with Luke Shaefer (lshaefer@umich.edu) to calculate mortgage interest for itemized deductions; this program needs to be run first, before the main program. Jonathan Latner contributed code to use the programs with the PSID zipped data. The overall methods build on the strategy for using TAXSIM with PSID data outlined by Butrica \& Burkhauser (1997), with some expansions and modifications. Note that the methods described below are designed to prioritize accuracy of income taxes calculated for low-income households, particularly refundable tax credits such as the Earned Income Tax Credit (EITC) and the Additional Child Tax Credit. Income tax liability is generally low for low-income households, and the amount of refundable tax credits is often substantially larger than tax liabilities for this population. Payroll tax can also be substantial for low-income households. Thus the methods below focus on maximizing accuracy of income tax and payroll tax calculations for low-income families, with less attention to tax items that largely impact higher-income households (e.g. 
the treatment of capital gains).}, author = {Kimberlin, Sara and Kim, Jiyoun and Shaefer, Luke} } @techreport {2410, title = {Using Social Media to Measure Labor Market Flows}, year = {2014}, type = {Mimeo}, url = {http://www-personal.umich.edu/~shapiro/papers/LaborFlowsSocialMedia.pdf}, author = {Antenucci, Dolan and Cafarella, Michael J and Levenstein, Margaret C. and R{\'e}, Christopher and Shapiro, Matthew} } @conference {woo:pih:acq:2014, title = {Would a Privacy Fundamentalist Sell their DNA for \$1000... if Nothing Bad Happened Thereafter? A Study of the Westin Categories, Behavioral Intentions, and Consequences}, booktitle = {Proceedings of the Tenth Symposium on Usable Privacy and Security (SOUPS)}, year = {2014}, note = {IAPP SOUPS Privacy Award Winner}, publisher = {ACM}, organization = {ACM}, address = {New York, NY}, url = {https://www.usenix.org/conference/soups2014/proceedings/presentation/woodruff}, author = {Woodruff, A. and Pihur, V. and Acquisti, A. and Consolvo, S. and Schmidt, L. and Brandimarte, L.} } @techreport {2653, title = {A Bayesian Approach to Graphical Record Linkage and De-duplication}, number = {1312.4645}, year = {2013}, abstract = {We propose an unsupervised approach for linking records across arbitrarily many files, while simultaneously detecting duplicate records within files. Our key innovation involves the representation of the pattern of links between records as a bipartite graph, in which records are directly linked to latent true individuals, and only indirectly linked to other records. This flexible representation of the linkage structure naturally allows us to estimate the attributes of the unique observable people in the population, calculate transitive linkage probabilities across records (and represent this visually), and propagate the uncertainty of record linkage into later analyses. Our method makes it particularly easy to integrate record linkage with post-processing procedures such as logistic regression, capture{\textendash}recapture, etc. Our linkage structure lends itself to an efficient, linear-time, hybrid Markov chain Monte Carlo algorithm, which overcomes many obstacles encountered by previous record linkage approaches, despite the high-dimensional parameter space. We illustrate our method using longitudinal data from the National Long Term Care Survey and with data from the Italian Survey on Household and Wealth, where we assess the accuracy of our method and show it to be better in terms of error rates and empirical scalability than other approaches in the literature. Supplementary materials for this article are available online.}, url = {https://arxiv.org/abs/1312.4645}, author = {Steorts, Rebecca C. and Hall, Rob and Fienberg, Stephen E.} } @techreport {handle:1813:37986, title = {b-Bit Minwise Hashing in Practice}, number = {1813:37986}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {b-Bit Minwise Hashing in Practice Li, Ping; Shrivastava, Anshumali; K{\"o}nig, Arnd Christian Minwise hashing is a standard technique in the context of search for approximating set similarities. The recent work [26, 32] demonstrated a potential use of b-bit minwise hashing [23, 24] for efficient search and learning on massive, high-dimensional, binary data (which are typical for many applications in Web search and text mining). 
In this paper, we focus on a number of critical issues which must be addressed before one can apply b-bit minwise hashing to the volumes of data often used in industrial applications.}, url = {http://hdl.handle.net/1813/37986}, author = {Li, Ping and Shrivastava, Anshumali and K{\"o}nig, Arnd Christian} } @conference {PingShrivastava2013, title = {b-Bit Minwise Hashing in Practice}, booktitle = {Internetware{\textquoteright}13}, year = {2013}, month = {October}, abstract = {Minwise hashing is a standard technique in the context of search for approximating set similarities. The recent work [26, 32] demonstrated a potential use of b-bit minwise hashing [23, 24] for efficient search and learning on massive, high-dimensional, binary data (which are typical for many applications in Web search and text mining). In this paper, we focus on a number of critical issues which must be addressed before one can apply b-bit minwise hashing to the volumes of data often used in industrial applications. Minwise hashing requires an expensive preprocessing step that computes k (e.g., 500) minimal values after applying the corresponding permutations for each data vector. We developed a parallelization scheme using GPUs and observed that the preprocessing time can be reduced by a factor of 20 ~ 80 and becomes substantially smaller than the data loading time. Reducing the preprocessing time is highly beneficial in practice, e.g., for duplicate Web page detection (where minwise hashing is a major step in the crawling pipeline) or for increasing the testing speed of online classifiers. Another critical issue is that for very large data sets it becomes impossible to store a (fully) random permutation matrix, due to its space requirements. Our paper is the first study to demonstrate that b-bit minwise hashing implemented using simple hash functions, e.g., the 2-universal (2U) and 4-universal (4U) hash families, can produce very similar learning results as using fully random permutations. Experiments on datasets of up to 200GB are presented.}, url = {http://www.nudt.edu.cn/internetware2013/}, author = {Ping Li and Anshumali Shrivastava and K{\"o}nig, Arnd Christian} } @conference {ShrivastavaLi2013a, title = {Beyond Pairwise: Provably Fast Algorithms for Approximate K-Way Similarity Search}, booktitle = {Neural Information Processing Systems (NIPS)}, year = {2013}, author = {Anshumali Shrivastava and Ping Li} } @conference {Spielman2013, title = {The Co-Evolution of Residential Segregation and the Built Environment at the Turn of the 20th Century: A Schelling Model}, booktitle = {Transactions in GIS}, year = {2013}, doi = {10.1111/tgis.12014}, author = {S.E. Spielman and Patrick Harrison} } @article {Shaefer2013, title = {Do single mothers in the United States use the Earned Income Tax Credit to reduce unsecured debt?}, journal = {Review of Economics of the Household}, number = {11}, year = {2013}, note = {NCRN}, pages = {659{\textendash}680}, type = {Journal Article}, abstract = {

The Earned Income Tax Credit (EITC) is a refundable credit for low income workers mainly targeted at families with children. This study uses the Survey of Income and Program Participation{\textquoteright}s topical modules on Assets and Liabilities to examine associations between the EITC expansions during the early 1990s and the unsecured debt of the households of single mothers. We use two difference-in-differences comparisons over the study period 1988{\textendash}1999, first comparing single mothers to single childless women, and then comparing single mothers with two or more children to single mothers with exactly one child. In both cases we find that the EITC expansions are associated with a relative decline in the unsecured debt of affected households of single mothers. While not direct evidence of a causal relationship, this is suggestive evidence that single mothers may have used part of their EITC to limit the growth of their unsecured debt during this period.

}, keywords = {Earned Income Tax Credit, Single Mothers, Unsecured Debt}, author = {Shaefer, H. Luke and Song, Xiaoqing and Williams Shanks, Trina R.} } @article {RebeccaC.Steorts2013, title = {On estimation of mean squared errors of benchmarked and empirical Bayes estimators}, journal = {Statistica Sinica}, volume = {23}, year = {2013}, pages = {749{\textendash}767}, author = {Rebecca C. Steorts and Malay Ghosh} } @conference {2163, title = {Examining the relationship between error and behavior in the American Time Use Survey using audit trail paradata}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Ruther, N. and T. Al Baghal and A. Eck and L. Stuart and L. Phillips and R. Belli and Soh, L-K} } @techreport {handle:1813:37987, title = {Fast Near Neighbor Search in High-Dimensional Binary Data}, number = {1813:37987}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Numerous applications in search, databases, machine learning, and computer vision can benefit from efficient algorithms for near neighbor search. This paper proposes a simple framework for fast near neighbor search in high-dimensional binary data, which are common in practice (e.g., text). We develop a very simple and effective strategy for sub-linear time near neighbor search, by creating hash tables directly using the bits generated by b-bit minwise hashing. The advantages of our method are demonstrated through thorough comparisons with two strong baselines: spectral hashing and sign (1-bit) random projections.}, url = {http://hdl.handle.net/1813/37987}, author = {Shrivastava, Anshumali and Li, Ping} } @article {wan:leo:chen:2013, title = {From Facebook Regrets to Facebook Privacy Nudges}, journal = {Ohio State Law Journal}, year = {2013}, note = {Invited paper}, author = {Wang, Y. and Leon, P. G. and Chen, X. and Komanduri, S. and Norcie, G. and Scott, K. and Acquisti, A. and Cranor, L. F. and Sadeh, N.} } @article {sad:fie:2013, title = {A Generalized Fellegi-Sunter Framework for Multiple Record Linkage with Application to Homicide Record Systems}, journal = {Journal of the American Statistical Association}, volume = {108}, number = {502}, year = {2013}, pages = {385{\textendash}397}, doi = {10.1080/01621459.2012.757231}, url = {http://dx.doi.org/10.1080/01621459.2012.757231}, author = {Sadinle, M. and Fienberg, S. E.} } @article {deng2013, title = {Handling Attrition in Longitudinal Studies: The Case for Refreshment Samples}, journal = {Statist. Sci.}, volume = {28}, year = {2013}, month = {05/2013}, pages = {238{\textendash}256}, chapter = {238}, abstract = {Panel studies typically suffer from attrition, which reduces sample size and can result in biased inferences. It is impossible to know whether or not the attrition causes bias from the observed panel data alone. Refreshment samples{\textemdash}new, randomly sampled respondents given the questionnaire at the same time as a subsequent wave of the panel{\textemdash}offer information that can be used to diagnose and adjust for bias due to attrition. We review and bolster the case for the use of refreshment samples in panel studies.
We include examples of both a fully Bayesian approach for analyzing the concatenated panel and refreshment data, and a multiple imputation approach for analyzing only the original panel. For the latter, we document a positive bias in the usual multiple imputation variance estimator. We present models appropriate for three waves and two refreshment samples, including nonterminal attrition. We illustrate the three-wave analysis using the 2007{\textendash}2008 Associated Press{\textendash}Yahoo! News Election Poll.}, doi = {10.1214/13-STS414}, url = {http://dx.doi.org/10.1214/13-STS414}, author = {Deng, Yiting and Hillygus, D. Sunshine and Reiter, Jerome P. and Si, Yajuan and Zheng, Siyu} } @article {Sengupta2013, title = {Hierarchical Statistical Modeling of Big Spatial Datasets Using the Exponential Family of Distributions}, journal = {Spatial Statistics}, volume = {4}, year = {2013}, pages = {14-44}, keywords = {EM algorithm, Empirical Bayes, Geostatistical process, Maximum likelihood estimation, MCMC, SRE model}, doi = {10.1016/j.spasta.2013.02.002}, url = {http://www.sciencedirect.com/science/article/pii/S2211675313000055}, author = {Sengupta, A. and Cressie, N.} } @article {Spielman2013a, title = {Identifying Neighborhoods Using High Resolution Population Data}, journal = {Annals of the Association of American Geographers}, volume = {103}, year = {2013}, pages = {67-84}, author = {S.E. Spielman and J. Logan} } @article {spielman2013EPB, title = {Neighborhood contexts, health, and behavior: understanding the role of scale and residential sorting}, journal = {Environment and Planning B}, volume = {3}, year = {2013}, author = {Spielman, S. E. and Linkletter, C. and Yoo, E.-H.} } @article {Si2013, title = {Nonparametric Bayesian multiple imputation for incomplete categorical variables in large-scale assessment surveys}, journal = {Journal of Educational and Behavioral Statistics}, volume = {38}, year = {2013}, pages = {499-521}, url = {http://www.stat.duke.edu/~jerry/Papers/StatinMed14.pdf}, author = {Si, Y. and Reiter, J.P.} } @conference {2141, title = {Predicting the occurrence of respondent retrieval strategies in calendar interviewing: The quality of autobiographical recall in surveys}, booktitle = {Biennial conference of the Society for Applied Research in Memory and Cognition}, year = {2013}, address = {Rotterdam, Netherlands}, url = {http://static1.squarespace.com/static/504170d6e4b0b97fe5a59760/t/52457a8be4b0012b7a5f462a/1380285067247/SARMAC_X_PaperJune27.pdf}, author = {Belli, R.F. and Miller, L.D. and Soh, L-K and T. Al Baghal} } @conference {2140, title = {Predicting the occurrence of respondent retrieval strategies in calendar interviewing: The quality of retrospective reports}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Belli, R.F. and Miller, L.D. and Soh, L-K and T. Al Baghal} } @techreport {2413, title = {Reconsidering the Consequences of Worker Displacements: Survey versus Administrative Measurements}, year = {2013}, institution = {University of Michigan}, type = {mimeo}, abstract = {Displaced workers suffer persistent earnings losses. This stark finding has been established by following workers in administrative data after mass layoffs under the presumption that these are involuntary job losses owing to economic distress. 
Using linked survey and administrative data, this paper examines this presumption by matching worker-supplied reasons for separations with what is happening at the firm. The paper documents substantially different earnings dynamics in mass layoffs depending on the reason the worker gives for the separation. Using a new methodology for accounting for the increase in the probability of separation among all types of survey response during a mass layoff, the paper finds earnings loss estimates that are surprisingly close to those using only administrative data. Finally, the survey-administrative link allows the decomposition of earnings losses due to subsequent nonemployment into non-participation and unemployment. Including the zero earnings of those identified as being unemployed substantially increases the estimate of earnings losses.}, url = {http://www-personal.umich.edu/~shapiro/papers/ReconsideringDisplacements.pdf}, author = {Flaaen, Aaron and Shapiro, Matthew and Isaac Sorkin} } @article {2262, title = {Ringtail: Feature Selection for Easier Nowcasting.}, journal = {WebDB}, year = {2013}, pages = {49-54}, chapter = {49}, abstract = {In recent years, social media {\textquotedblleft}nowcasting{\textquotedblright}{\textemdash}the use of online user activity to predict various ongoing real-world social phenomena{\textemdash}has become a popular research topic; yet, this popularity has not led to widespread actual practice. We believe a major obstacle to widespread adoption is the feature selection problem. Typical nowcasting systems require the user to choose a set of relevant social media objects, which is difficult, time-consuming, and can imply a statistical background that users may not have. We propose Ringtail, which helps the user choose relevant social media signals. It takes a single user input string (e.g., unemployment) and yields a number of relevant signals the user can use to build a nowcasting model. We evaluate Ringtail on six different topics using a corpus of almost 6 billion tweets, showing that features chosen by Ringtail in a wholly-automated way are better or as good as those from a human and substantially better if Ringtail receives some human assistance. In all cases, Ringtail reduces the burden on the user.}, url = {http://www.cs.stanford.edu/people/chrismre/papers/webdb_ringtail.pdf}, author = {Antenucci, Dolan and Cafarella, Michael J and Levenstein, Margaret C. and R{\'e}, Christopher and Shapiro, Matthew} } @article {2259, title = {Rising extreme poverty in the United States and the response of means-tested transfers.}, journal = {Social Service Review}, volume = {87}, year = {2013}, month = {06/2013}, pages = {250-268}, chapter = {250}, abstract = {This study documents an increase in the prevalence of extreme poverty among US households with children between 1996 and 2011 and assesses the response of major federal means-tested transfer programs. Extreme poverty is defined using a World Bank metric of global poverty: \$2 or less, per person, per day. Using the 1996{\textendash}2008 panels of the Survey of Income and Program Participation (SIPP), we estimate that in mid-2011, 1.65 million households with 3.55 million children were living in extreme poverty in a given month, based on cash income, constituting 4.3 percent of all nonelderly households with children. The prevalence of extreme poverty has risen sharply since 1996, particularly among those most affected by the 1996 welfare reform.
Adding SNAP benefits to household income reduces the number of extremely poor households with children by 48.0 percent in mid-2011. Adding SNAP, refundable tax credits, and housing subsidies reduces it by 62.8 percent.}, doi = {10.1086/671012}, url = {http://www.jstor.org/stable/10.1086/671012}, author = {H. Luke Shaefer and Edin, K.} } @article {1559, title = {Two-stage Bayesian benchmarking as applied to small area estimation}, journal = {TEST}, volume = {22}, year = {2013}, month = {2013}, chapter = {670}, keywords = {small area estimation}, author = {Rebecca C. Steorts and Malay Ghosh} } @mastersthesis {Stuart2013, title = {User Modeling via Machine Learning and Rule-based Reasoning to Understand and Predict Errors in Survey Systems}, year = {2013}, school = {University of Nebraska-Lincoln}, type = {Masters}, url = {http://digitalcommons.unl.edu/computerscidiss/70/}, author = {Stuart, Leonard Cleve} } @article {spielman2013using, title = {Using High Resolution Population Data to Identify Neighborhoods and Determine their Boundaries}, journal = {Annals of the Association of American Geographers}, volume = {103}, number = {1}, year = {2013}, pages = {67-84}, doi = {10.1080/00045608.2012.685049}, url = {http://www.tandfonline.com/doi/abs/10.1080/00045608.2012.685049}, author = {Spielman, S. E. and Logan, J.} } @conference {2134, title = {What are you doing now?: Audit trails, Activity level responses and error in the American Time Use Survey}, booktitle = {American Association for Public Opinion Research}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {T. Al Baghal and Phillips, A.L. and Ruther, N. and Belli, R.F. and Stuart, L. and Eck, A. and Soh, L-K} } @conference {hal:ste:fie:2012, title = {Bayesian Parametric and Nonparametric Inference for Multiple Record Linkage}, booktitle = {Modern Nonparametric Methods in Machine Learning Workshop}, year = {2012}, publisher = {NIPS}, organization = {NIPS}, url = {http://www.stat.cmu.edu/NCRN/PUBLIC/files/beka_nips_finalsub4.pdf}, author = {Hall, R. and Steorts, R. and Fienberg, S. E.} } @conference {Steorts2012a, title = {On Estimation of Mean Squared Errors of Benchmarked and Empirical Bayes Estimators}, booktitle = {2012 Joint Statistical Meetings}, year = {2012}, month = {August}, address = {San Diego, CA}, author = {Rebecca C. Steorts and Malay Ghosh} } @conference {2166, title = {Exploring interviewer and respondent interactions: An innovative behavior coding approach}, booktitle = {Midwest Association for Public Opinion Research 2012 Annual Conference}, year = {2012}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Walton, L. and Stange, M. and Powell, R. and Belli, R.F.} } @booklet {Shaefer2012, title = {Extreme Poverty in the United States, 1996 to 2011}, year = {2012}, note = {NCRN}, month = {February 2012}, publisher = {University of Michigan}, type = {Report}, url = {http://www.npc.umich.edu/publications/policy_briefs/brief28/policybrief28.pdf}, author = {Shaefer, H.
Luke and Edin, Kathryn} } @conference {CIKM-SunSL12, title = {Fast Multi-task Learning for Query Spelling Correction}, booktitle = {The 21$^{st}$ ACM International Conference on Information and Knowledge Management (CIKM 2012)}, year = {2012}, pages = {285{\textendash}294}, doi = {10.1145/2396761.2396800}, url = {http://dx.doi.org/10.1145/2396761.2396800}, author = {Xu Sun and Anshumali Shrivastava and Ping Li} } @conference {ShrivastavaL12, title = {Fast Near Neighbor Search in High-Dimensional Binary Data}, booktitle = {The European Conference on Machine Learning (ECML 2012)}, year = {2012}, author = {Anshumali Shrivastava and Ping Li} } @techreport {Sadinle2012b, title = {A Generalized Fellegi-Sunter Framework for Multiple Record Linkage with Application to Homicide Records Systems}, number = {1205.3217}, year = {2012}, url = {https://arxiv.org/abs/1205.3217}, author = {Mauricio Sadinle and Stephen E. Fienberg} } @conference {LiSK12, title = {GPU-based minwise hashing}, booktitle = {Proceedings of the 21st World Wide Web Conference (WWW 2012) (Companion Volume)}, year = {2012}, pages = {565-566}, doi = {10.1145/2187980.2188129}, url = {http://doi.acm.org/10.1145/2187980.2188129}, author = {Ping Li and Anshumali Shrivastava and Arnd Christian K{\"o}nig} } @booklet {Sengupta2012, title = {Hierarchical Statistical Modeling of Big Spatial Datasets Using the Exponential Family of Distributions}, number = {879}, year = {2012}, publisher = {The Ohio State University}, author = {Sengupta, A. and Cressie, N.} } @conference {Sadinle2012a, title = {Logit-Based Confidence Intervals for Single Capture-Recapture Estimation}, booktitle = {American Statistical Association Pittsburgh Chapter Banquet}, year = {2012}, note = {April 9, 2012}, month = {April}, address = {Pittsburgh, PA}, author = {Mauricio Sadinle} } @conference {Shalizi-JSM2012, title = {Maintaining Quality in the Face of Rapid Program Expansion}, booktitle = {2012 Joint Statistical Meetings}, year = {2012}, month = {August}, address = {San Diego, CA}, author = {Cosma Shalizi and Rebecca Nugent} } @conference {Sadinle2012, title = {Multi-File Record Linkage Using a Generalized Fellegi-Sunter Framework}, booktitle = {Conference Presentation Classification Society Annual Meeting, Carnegie Mellon University}, year = {2012}, author = {Mauricio Sadinle} } @conference {WWW-SunSL12, title = {Query spelling correction using multi-task learning}, booktitle = {Proceedings of the 21st World Wide Web Conference (WWW 2012) (Companion Volume)}, year = {2012}, pages = {613-614}, doi = {10.1145/2187980.2188153}, url = {http://doi.acm.org/10.1145/2187980.2188153}, author = {Xu Sun and Anshumali Shrivastava and Ping Li} } @article {SrivastavaLS12, title = {Testing for Membership to the IFRA and the NBU Classes of Distributions}, journal = {Journal of Machine Learning Research - Proceedings Track for the Fifteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2012)}, volume = {22}, year = {2012}, pages = {1099-1107}, url = {http://jmlr.csail.mit.edu/proceedings/papers/v22/srivastava12.html}, author = {Radhendushka Srivastava and Ping Li and Debasis Sengupta} } @conference {Spielman2012, title = {Thinking inside the box: Mapping the microstructure of urban environment (and why it matters)}, booktitle = {AutoCarto 2012}, year = {2012}, address = {Columbus, Ohio}, keywords = {cartography}, url = {http://www.cartogis.org/docs/proceedings/2012/Spielman_etal_AutoCarto2012.pdf}, author = {Seth Spielman and David
Folch and John Logan and Nicholas Nagle} } @article {ShaeferYbarra2012, title = {The welfare reforms of the 1990s and the stratification of material well-being among low-income households with children}, journal = {Children and Youth Services Review}, volume = {34}, number = {8}, year = {2012}, note = {NCRN}, pages = {1810-1817}, type = {Journal Article}, abstract = {

We examine the incidence of material hardship experienced by low-income households with children, before and after the major changes to U.S. anti-poverty programs during the 1990s. We use the Survey of Income and Program Participation (SIPP) to examine a series of measures of household material hardship that were collected in the years 1992, 1995, 1998, 2003 and 2005. We stratify our sample to differentiate between the 1) deeply poor ({\textless}50\% of poverty), who saw a decline in public assistance over this period; and two groups that saw some forms of public assistance increase: 2) other poor households (50{\textendash}99\% of poverty), and 3) the near poor (100{\textendash}150\% of poverty). We report bivariate trends over the study period, as well as presenting multivariate difference-in-differences estimates. We find suggestive evidence that material hardship{\textemdash}in the form of difficulty meeting essential household expenses, and falling behind on utilities costs{\textemdash}has generally increased among the deeply poor but has remained roughly the same for the middle group (50{\textendash}99\% of poverty), and decreased among the near poor (100{\textendash}150\% of poverty). Multivariate difference-in-differences estimates suggest that these trends have resulted in intensified stratification of the material well-being of low-income households with children.

}, author = {Shaefer, H. Luke and Ybarra, Marci} } @conference {sad:hal:fie:2011, title = {Approaches to Multiple Record Linkage}, booktitle = {Proceedings of the 58th World Statistical Congress}, year = {2011}, pages = {1064{\textendash}1071}, publisher = {International Statistical Institute}, organization = {International Statistical Institute}, address = {Dublin}, url = {http://2011.isiproceedings.org/papers/450092.pdf}, author = {Sadinle, M. and Hall, R. and Fienberg, S. E.} } @techreport {handle:1813:34516, title = {Do Single Mothers in the United States use the Earned Income Tax Credit to Reduce Unsecured Debt?}, number = {1813:34516}, year = {2011}, institution = {University of Michigan}, type = {Preprint}, abstract = {Do Single Mothers in the United States use the Earned Income Tax Credit to Reduce Unsecured Debt? Shaefer, H. Luke; Song, Xiaoqing; Williams Shanks, Trina R. The Earned Income Tax Credit (EITC) is a refundable credit for low-income workers that is mainly targeted at families with children. This study uses the Survey of Income and Program Participation{\textquoteright}s (SIPP) topical modules on Assets \& Liabilities to examine the effects of EITC expansions during the early 1990s on the unsecured debt of the households of single mothers. We use two difference-in-differences comparisons over the study period 1988 to 1999, first comparing single mothers to single childless women, and then comparing single mothers with two or more children to single mothers with exactly one child. In both cases we find that the EITC expansions are associated with a relative decline in the unsecured debt of affected households of single mothers. This suggests that single mothers may have used part of their EITC to limit the growth of their unsecured debt during this period.}, url = {http://hdl.handle.net/1813/34516}, author = {Shaefer, H. Luke and Song, Xiaoqing and Williams Shanks, Trina R.} } @booklet {2518, title = {Are Self-Description Scales Better than Agree/Disagree Scales in Mail and Telephone Surveys?}, author = {Timbrook, Jerry and Smyth, Jolene D. and Olson, Kristen} } @booklet {2519, title = {Are Self-Description Scales Better than Agree/Disagree Scales in Mail and Telephone Surveys?}, author = {Timbrook, Jerry and Smyth, Jolene D. and Olson, Kristen} } @article {2263, title = {Bayesian estimation of bipartite matchings for record linkage}, journal = {Journal of the American Statistical Association}, abstract = {The bipartite record linkage task consists of merging two disparate datafiles containing information on two overlapping sets of entities. This is non-trivial in the absence of unique identifiers and it is important for a wide variety of applications given that it needs to be solved whenever we have to combine information from different sources. Most statistical techniques currently used for record linkage are derived from a seminal paper by Fellegi and Sunter (1969). These techniques usually assume independence in the matching statuses of record pairs to derive estimation procedures and optimal point estimators. We argue that this independence assumption is unreasonable and instead target a bipartite matching between the two datafiles as our parameter of interest. Bayesian implementations allow us to quantify uncertainty on the matching decisions and derive a variety of point estimators using different loss functions. We propose partial Bayes estimates that allow uncertain parts of the bipartite matching to be left unresolved. 
We evaluate our approach to record linkage using a variety of challenging scenarios and show that it outperforms the traditional methodology. We illustrate the advantages of our methods merging two datafiles on casualties from the civil war of El Salvador.}, author = {Mauricio Sadinle} } @article {2662, title = {Biomass prediction using density dependent diameter distribution models}, journal = {Annals of Applied Statistics}, volume = {11}, pages = {340-361}, abstract = {Prediction of aboveground biomass, particularly at large spatial scales, is necessary for estimating global-scale carbon sequestration. Since biomass can be measured only by sacrificing trees, total biomass on plots is never observed. Rather, allometric equations are used to convert individual tree diameter to individual biomass, perhaps with noise. The values for all trees on a plot are then summed to obtain a derived total biomass for the plot. Then, with derived total biomasses for a collection of plots, regression models, using appropriate environmental covariates, are employed to attempt explanation and prediction. Not surprisingly, when out-of-sample validation is examined, such a model will predict total biomass well for holdout data because it is obtained using exactly the same derived approach. Apart from the somewhat circular nature of the regression approach, it also fails to employ the actual observed plot level response data. At each plot, we observe a random number of trees, each with an associated diameter, producing a sample of diameters. A model based on this random number of tree diameters provides understanding of how environmental regressors explain abundance of individuals, which in turn explains individual diameters. We incorporate density dependence because the distribution of tree diameters over a plot of fixed size depends upon the number of trees on the plot. After fitting this model, we can obtain predictive distributions for individual-level biomass and plot-level total biomass. We show that predictive distributions for plot-level biomass obtained from a density-dependent model for diameters will be much different from predictive distributions using the regression approach. Moreover, they can be more informative for capturing uncertainty than those obtained from modeling derived plot-level biomass directly. We develop a density-dependent diameter distribution model and illustrate with data from the national Forest Inventory and Analysis (FIA) database. We also describe how to scale predictions to larger spatial regions. Our predictions agree (in magnitude) with available wisdom on mean and variation in biomass at the hectare scale.}, url = {https://projecteuclid.org/euclid.aoas/1491616884}, author = {Schliep, E.M. and A.E. Gelfand and J.S. Clark and B.J. 
Tomasek} } @booklet {2512, title = {"During the LAST YEAR, Did You...": The Effect of Emphasis in CATI Survey Questions on Data Quality}, author = {Olson, Kristen and Smyth, Jolene D.} } @booklet {2511, title = {"During the LAST YEAR, Did You...": The Effect of Emphasis in CATI Survey Questions on Data Quality}, author = {Olson, Kristen and Smyth, Jolene D.} } @booklet {2510, title = {The Effect of Question Characteristics, Respondents and Interviewers on Question Reading Time and Question Reading Behaviors in CATI Surveys}, author = {Olson, Kristen and Smyth, Jolene and Kirchner, Antje} } @booklet {2503, title = {The Effects of Respondent and Question Characteristics on Respondent Behaviors}, author = {Ganshert, Amanda and Olson, Kristen and Smyth, Jolene} } @booklet {2506, title = {Going off Script: How Interviewer Behavior Affects Respondent Behaviors in Telephone Surveys}, author = {Kirchner, Antje and Olson, Kristen and Smyth, Jolene} } @booklet {2514, title = {How do Low Versus High Response Scale Ranges Impact the Administration and Answering of Behavioral Frequency Questions in Telephone Surveys?}, author = {Sarwar, Mazen and Olson, Kristen and Smyth, Jolene} } @booklet {2516, title = {How do Mismatches Affect Interviewer/Respondent Interactions in the Question/Answer Process?}, author = {Smyth, Jolene D. and Olson, Kristen} } @booklet {2501, title = {Interviewer Influence on Interviewer-Respondent Interaction During Battery Questions}, author = {Cochran, Beth and Olson, Kristen and Smyth, Jolene} } @booklet {2513, title = {Response Scales: Effects on Data Quality for Interviewer Administered Surveys}, author = {Sarwar, Mazen and Olson, Kristen and Smyth, Jolene} } @booklet {2530, title = {Using audit trails to evaluate an event history calendar survey instrument}, author = {Lee, Jinyoung and Seloske, Ben and Belli, Robert F.} } @booklet {2517, title = {Why do Mobile Interviews Take Longer? A Behavior Coding Perspective}, author = {Timbrook, Jerry and Smyth, Jolene and Olson, Kristen} } @booklet {2531, title = {Working with the SIPP-EHC audit trails: Parallel and sequential retrieval}, author = {Lee, Jinyoung and Seloske, Ben and C{\'o}rdova Cazar, Ana Luc{\'\i}a and Eck, Adam and Belli, Robert F.} }