@article {2566, title = {Data fusion for correcting measurement errors}, year = {Submitted}, abstract = {Often in surveys, key items are subject to measurement errors. Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. In some settings, however, analysts have access to a data source on different individuals with high quality measurements of the error-prone survey items. We present a data fusion framework for leveraging this information to improve inferences in the error-prone survey. The basic idea is to posit models about the rates at which individuals make errors, coupled with models for the values reported when errors are made. This can avoid the unrealistic assumption of conditional independence typically used in data fusion. We apply the approach on the reported values of educational attainments in the American Community Survey, using the National Survey of College Graduates as the high quality data source. In doing so, we account for the informative sampling design used to select the National Survey of College Graduates. We also present a process for assessing the sensitivity of various analyses to different choices for the measurement error models. Supplemental material is available online.}, author = {J. P. Reiter and T. Schifeling and M. De Yoreo} } @article {2564, title = {A framework for sharing confidential research data, applied to investigating differential pay by race in the U. S. government}, year = {Submitted}, abstract = {Data stewards seeking to provide access to large-scale social science data face a difficult challenge. They have to share data in ways that protect privacy and confidentiality, are informative for many analyses and purposes, and are relatively straightforward to use by data analysts. We present a framework for addressing this challenge. The framework uses an integrated system that includes fully synthetic data intended for wide access, coupled with means for approved users to access the confidential data via secure remote access solutions, glued together by verification servers that allow users to assess the quality of their analyses with the synthetic data. We apply this framework to data on the careers of employees of the U. S. federal government, studying differentials in pay by race. The integrated system performs as intended, allowing users to explore the synthetic data for potential pay differentials and learn through verifications which findings in the synthetic data hold up in the confidential data and which do not. We find differentials across races; for example, the gap between black and white female federal employees{\textquoteright} pay increased over the time period. We present models for generating synthetic careers and differentially private algorithms for verification of regression results. }, author = {Barrientos, A. F. and Bolton, A. and Balmat, T. and Reiter, J. P. and Machanavajjhala, A. and Chen, Y. and Kneifel, C. and DeLong, M. and de Figueiredo, J. M.} } @article {2559, title = {Imputation in U.S. Manufacturing Data and Its Implications for Productivity Dispersion}, journal = {Review of Economics and Statistics}, year = {Submitted}, abstract = {In the U.S. Census Bureau{\textquoteright}s 2002 and 2007 Censuses of Manufactures 79\% and 73\% of observations respectively have imputed data for at least one variable used to compute total factor productivity. 
The Bureau primarily imputes for missing values using mean-imputation methods, which can reduce the true underlying variance of the imputed variables. For every variable entering TFP in 2002 and 2007, we show that the dispersion is significantly smaller in the Census mean-imputed data than in the Census non-imputed data. As an alternative to mean imputation, we show how to use classification and regression trees (CART) to allow for a distribution of multiple possible imputed values based on other plants that are CART-algorithmically determined to be similar based on other observed variables. For 90\% of the 473 industries in 2002 and 84\% of the 471 industries in 2007, we find that TFP dispersion increases as we move from Census mean-imputed data to Census non-imputed data to the CART-imputed data.}, doi = {10.1162/REST_a_00678}, url = {http://www.mitpressjournals.org/doi/abs/10.1162/REST_a_00678}, author = {T. Kirk White and Jerome P. Reiter and Amil Petrin} } @article {2558, title = {Sequential identification of nonignorable missing data mechanisms}, journal = {Statistica Sinica}, year = {Submitted}, month = {01/2017}, abstract = {With nonignorable missing data, likelihood-based inference should be based on the joint distribution of the study variables and their missingness indicators. These joint models cannot be estimated from the data alone, thus requiring the analyst to impose restrictions that make the models uniquely obtainable from the distribution of the observed data. We present an approach for constructing classes of identifiable nonignorable missing data models. The main idea is to use a sequence of carefully set up identifying assumptions, whereby we specify potentially different missingness mechanisms for different blocks of variables. We show that the procedure results in models with the desirable property of being non-parametric saturated.}, keywords = {Identification, Missing not at random, Non-parametric saturated, Partial ignorability, Sensitivity analysis}, doi = {10.5705/ss.202016.0328}, author = {Mauricio Sadinle and Jerome P. Reiter} } @article {2634, title = {The Earned Income Tax Credit and Food Insecurity: Who Benefits?}, year = {forthcoming}, author = {Shaefer, H.L. and Wilson, R.} } @article {2635, title = {The Response of Consumer Spending to Changes in Gasoline Prices}, year = {forthcoming}, abstract = {This paper estimates how overall consumer spending responds to changes in gasoline prices. It uses the differential impact across consumers of the sudden, large drop in gasoline prices in 2014 for identification. This estimation strategy is implemented using comprehensive, daily transaction-level data for a large panel of individuals. The estimated marginal propensity to consume (MPC) is approximately one, a higher estimate than those found in less comprehensive or less well-measured data. This estimate takes into account the elasticity of demand for gasoline and potential slow adjustment to changes in prices.
The high MPC implies that changes in gasoline prices have large aggregate effects.}, author = {Gelman, Michael and Gorodnichenko, Yuriy and Kariv, Shachar and Koustas, Dmitri and Shapiro, Matthew D and Silverman, Daniel and Tadelis, Steven} } @article {2636, title = {Understanding Household Consumption and Saving Behavior using Account Data}, year = {forthcoming}, author = {Gelman, Michael} } @article {jole2018, title = {Earnings Inequality and Mobility Trends in the United States: Nationally Representative Estimates from Longitudinally Linked Employer-Employee Data}, journal = {Journal of Labor Economics}, year = {2018}, abstract = {Using earnings data from the U.S. Census Bureau, this paper analyzes the role of the employer in explaining the rise in earnings inequality in the United States. We first establish a consistent frame of analysis appropriate for administrative data used to study earnings inequality. We show that the trends in earnings inequality in the administrative data from the Longitudinal Employer-Household Dynamics Program are inconsistent with other data sources when we do not correct for the presence of misused SSNs. After this correction to the worker frame, we analyze how the earnings distribution has changed in the last decade. We present a decomposition of the year-to-year changes in the earnings distribution from 2004-2013. Even when simplifying these flows to movements between the bottom 20\%, the middle 60\% and the top 20\% of the earnings distribution, about 20.5 million workers undergo a transition each year. Another 19.9 million move between employment and nonemployment. To understand the role of the firm in these transitions, we estimate a model for log earnings with additive fixed worker and firm effects using all jobs held by eligible workers from 2004-2013. We construct a composite log earnings firm component across all jobs for a worker in a given year and a non-firm component. We also construct a skill-type index. We show that, while the difference between working at a low- or middle-paying firm are relatively small, the gains from working at a top-paying firm are large. Specifically, the benefits of working for a high-paying firm are not only realized today, through higher earnings paid to the worker, but also persist through an increase in the probability of upward mobility. High-paying firms facilitate moving workers to the top of the earnings distribution and keeping them there.}, author = {John M. Abowd and Kevin L. Mckinney and Nellie Zhao} } @article {annalsSorting, title = {Sorting Between and Within Industries: A Testable Model of Assortative Matching}, journal = {Annals of Economics and Statistics}, year = {2018}, issn = {21154430, 19683863}, author = {John M. Abowd and Francis Kramarz and Sebastien Perez-Duarte and Ian M. Schmutte} } @article {2663, title = {Adaptively-Tuned Particle Swarm Optimization with Application to Spatial Design}, journal = {Stat}, volume = {6}, year = {2017}, pages = {145{\textendash}159}, abstract = {Particle swarm optimization (PSO) algorithms are a class of heuristic optimization algorithms that are attractive for complex optimization problems. We propose using PSO to solve spatial design problems, e.g. choosing new locations to add to an existing monitoring network. Additionally, we introduce two new classes of PSO algorithms that perform well in a wide variety of circumstances, called adaptively tuned PSO and adaptively tuned bare bones PSO. 
To illustrate these algorithms, we apply them to a common spatial design problem: choosing new locations to add to an existing monitoring network. Specifically, we consider a network in the Houston, TX, area for monitoring ambient ozone levels, which have been linked to out-of-hospital cardiac arrest rates. Published 2017. This article has been contributed to by US Government employees and their work is in the public domain in the USA}, doi = {10.1002/sta4.142}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.142/abstract}, author = {Simpson, M. and Wikle, C.K. and Holan, S.H.} } @article {2561, title = {Bayesian estimation of bipartite matchings for record linkage}, journal = { Journal of the American Statistical Association}, volume = {112}, year = {2017}, month = {03/2017}, chapter = {600}, abstract = {The bipartite record linkage task consists of merging two disparate datafiles containing information on two overlapping sets of entities. This is non-trivial in the absence of unique identifiers and it is important for a wide variety of applications given that it needs to be solved whenever we have to combine information from different sources. Most statistical techniques currently used for record linkage are derived from a seminal paper by Fellegi and Sunter (1969). These techniques usually assume independence in the matching statuses of record pairs to derive estimation procedures and optimal point estimators. We argue that this independence assumption is unreasonable and instead target a bipartite matching between the two datafiles as our parameter of interest. Bayesian implementations allow us to quantify uncertainty on the matching decisions and derive a variety of point estimators using different loss functions. We propose partial Bayes estimates that allow uncertain parts of the bipartite matching to be left unresolved. We evaluate our approach to record linkage using a variety of challenging scenarios and show that it outperforms the traditional methodology. We illustrate the advantages of our methods merging two datafiles on casualties from the civil war of El Salvador.}, keywords = {Assignment problem, Bayes estimate, Data matching, Fellegi-Sunter decision rule, Mixture model, Rejection option}, doi = {10.1080/01621459.2016.1148612}, url = {http://amstat.tandfonline.com/doi/abs/10.1080/01621459.2016.1148612}, author = {Mauricio Sadinle} } @article {2664, title = {Bayesian Hierarchical Multi-Population Multistate Jolly-Seber Models with Covariates: Application to the Pallid Sturgeon Population Assessment Program}, journal = {Journal of the American Statistical Association}, volume = {112}, year = {2017}, pages = {471-483}, abstract = {Estimating abundance for multiple populations is of fundamental importance to many ecological monitoring programs. Equally important is quantifying the spatial distribution and characterizing the migratory behavior of target populations within the study domain. To achieve these goals, we propose a Bayesian hierarchical multi-population multistate Jolly{\textendash}Seber model that incorporates covariates. The model is proposed using a state-space framework and has several distinct advantages. First, multiple populations within the same study area can be modeled simultaneously. As a consequence, it is possible to achieve improved parameter estimation by {\textquotedblleft}borrowing strength{\textquotedblright} across different populations. 
In many cases, such as our motivating example involving endangered species, this borrowing of strength is crucial, as there is relatively less information for one of the populations under consideration. Second, in addition to accommodating covariate information, we develop a computationally efficient Markov chain Monte Carlo algorithm that requires no tuning. Importantly, the model we propose allows us to draw inference on each population as well as on multiple populations simultaneously. Finally, we demonstrate the effectiveness of our method through a motivating example of estimating the spatial distribution and migration of hatchery and wild populations of the endangered pallid sturgeon (Scaphirhynchus albus), using data from the Pallid Sturgeon Population Assessment Program on the Lower Missouri River. Supplementary materials for this article are available online.}, doi = {10.1080/01621459.2016.1211531}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2016.1211531}, author = {Wu, G. and Holan, S.H.} } @article {2658, title = {The Cepstral Model for Multivariate Time Series: The Vector Exponential Model}, journal = {Statistica Sinica}, volume = {27}, year = {2017}, pages = {23-42}, abstract = {Vector autoregressive (VAR) models have become a staple in the analysis of multivariate time series and are formulated in the time domain as difference equations, with an implied covariance structure. In many contexts, it is desirable to work with a stable, or at least stationary, representation. To fit such models, one must impose restrictions on the coefficient matrices to ensure that certain determinants are nonzero; which, except in special cases, may prove burdensome. To circumvent these difficulties, we propose a flexible frequency domain model expressed in terms of the spectral density matrix. Specifically, this paper treats the modeling of covariance stationary vector-valued (i.e., multivariate) time series via an extension of the exponential model for the spectrum of a scalar time series. We discuss the modeling advantages of the vector exponential model and its computational facets, such as how to obtain Wold coefficients from given cepstral coefficients. Finally, we demonstrate the utility of our approach through simulation as well as two illustrative data examples focusing on multi-step ahead forecasting and estimation of squared coherence.}, keywords = {Autocovariance matrix, Bayesian estimation, Cepstral, Coherence, Spectral density matrix, stochastic search variable selection, Wold coefficients.}, doi = {10.5705/ss.202014.0024}, url = {http://www3.stat.sinica.edu.tw/statistica/J27N1/J27N12/J27N12.html}, author = {Holan, S.H. and McElroy, T.S. and Wu, G.} } @techreport {2655, title = {Computationally Efficient Multivariate Spatio-Temporal Models for High-Dimensional Count-Valued Data. (With Discussion).}, number = {1512.07273}, year = {2017}, abstract = {We introduce a Bayesian approach for multivariate spatio-temporal prediction for high-dimensional count-valued data. Our primary interest is when there are possibly millions of data points referenced over different variables, geographic regions, and times. This problem requires extensive methodological advancements, as jointly modeling correlated data of this size leads to the so-called "big n problem." The computational complexity of prediction in this setting is further exacerbated by acknowledging that count-valued data are naturally non-Gaussian. 
Thus, we develop a new computationally efficient distribution theory for this setting. In particular, we introduce a multivariate log-gamma distribution and provide substantial theoretical development including: results regarding conditional distributions, marginal distributions, an asymptotic relationship with the multivariate normal distribution, and full-conditional distributions for a Gibbs sampler. To incorporate dependence between variables, regions, and time points, a multivariate spatio-temporal mixed effects model (MSTM) is used. The results in this manuscript are extremely general, and can be used for data that exhibit fewer sources of dependency than what we consider (e.g., multivariate, spatial-only, or spatio-temporal-only data). Hence, the implications of our modeling framework may have a large impact on the general problem of jointly modeling correlated count-valued data. We show the effectiveness of our approach through a simulation study. Additionally, we demonstrate our proposed methodology with an important application analyzing data obtained from the Longitudinal Employer-Household Dynamics (LEHD) program, which is administered by the U.S. Census Bureau.}, keywords = {Aggregation, American Community Survey, Bayesian hierarchical model, Big Data, Longitudinal Employer-Household Dynamics (LEHD) program, Markov chain Monte Carlo, Non-Gaussian., Quarterly Workforce Indicators}, url = {https://arxiv.org/abs/1512.07273}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K.} } @article {2490, title = {Cost-Benefit Analysis for a Quinquennial Census: The 2016 Population Census of South Africa}, journal = {Journal of Official Statistics}, volume = {33}, year = {2017}, month = {02/2017}, abstract = {The question of whether to carry out a quinquennial Census is faced by national statistical offices in increasingly many countries, including Canada, Nigeria, Ireland, Australia, and South Africa. We describe uses and limitations of cost-benefit analysis in this decision problem in the case of the 2016 Census of South Africa. The government of South Africa needed to decide whether to conduct a 2016 Census or to rely on increasingly inaccurate postcensal estimates accounting for births, deaths, and migration since the previous (2011) Census. The cost-benefit analysis compared predicted costs of the 2016 Census to the benefits of improved allocation of intergovernmental revenue, which was considered by the government to be a critical use of the 2016 Census, although not the only important benefit. Without the 2016 Census, allocations would be based on population estimates. Accuracy of the postcensal estimates was estimated from the performance of past estimates, and the hypothetical expected reduction in errors in allocation due to the 2016 Census was estimated. A loss function was introduced to quantify the improvement in allocation. With this evidence, the government was able to decide not to conduct the 2016 Census, but instead to improve data and capacity for producing post-censal estimates.}, keywords = {demographic statistics, fiscal allocations, loss function, population estimates, post-censal estimates}, isbn = { 2001-7367}, doi = {10.1515/jos-2017-0013}, url = {https://www.degruyter.com/view/j/jos.2017.33.issue-1/jos-2017-0013/jos-2017-0013.xml}, author = {Spencer, Bruce D. 
and May, Julian and Kenyon, Steven and Seeskin, Zachary} } @conference {synthdiagicdm, title = {Differentially private regression diagnostics}, booktitle = {IEEE International Conference on Data Mining}, year = {2017}, abstract = {Many data producers seek to provide users access to confidential data without unduly compromising data subjects{\textquoteright} privacy and confidentiality. When intense redaction is needed to do so, one general strategy is to require users to do analyses without seeing the confidential data, for example, by releasing fully synthetic data or by allowing users to query remote systems for disclosure-protected outputs of statistical models. With fully synthetic data or redacted outputs, the analyst never really knows how much to trust the resulting findings. In particular, if the user did the same analysis on the confidential data, would regression coefficients of interest be statistically significant or not? We present algorithms for assessing this question that satisfy differential privacy. We describe conditions under which the algorithms should give accurate answers about statistical significance. We illustrate the properties of the methods using artificial and genuine data.}, author = {Chen, Y. and Machanavajjhala, A. and Reiter, J. P. and Barrientos, A.} } @article {Hu2017-nm, title = {Dirichlet Process Mixture Models for Modeling and Generating Synthetic Versions of Nested Categorical Data}, journal = {Bayesian Analysis}, year = {2017}, month = {24 January 2017}, abstract = {We present a Bayesian model for estimating the joint distribution of multivariate categorical data when units are nested within groups. Such data arise frequently in social science settings, for example, people living in households. The model assumes that (i) each group is a member of a group-level latent class, and (ii) each unit is a member of a unit-level latent class nested within its group-level latent class. This structure allows the model to capture dependence among units in the same group. It also facilitates simultaneous modeling of variables at both group and unit levels. We develop a version of the model that assigns zero probability to groups and units with physically impossible combinations of variables. We apply the model to estimate multivariate relationships in a subset of the American Community Survey. Using the estimated model, we generate synthetic household data that could be disseminated as redacted public use files. Supplementary materials (Hu et al., 2017) for this article are available online.}, doi = {10.1214/16-BA1047}, url = {http://projecteuclid.org/euclid.ba/1485227030}, author = {Hu, Jingchen and Reiter, Jerome P and Wang, Quanli} } @article {2507, title = {Do Interviewer Post-survey Evaluations of Respondents Measure Who Respondents Are or What They Do? A Behavior Coding Study}, journal = {Public Opinion Quarterly}, year = {2017}, month = {08/2017}, abstract = {Survey interviewers are often tasked with assessing the quality of respondents{\textquoteright} answers after completing a survey interview. These interviewer observations have been used to proxy for measurement error in interviewer-administered surveys. How interviewers formulate these evaluations and how well they proxy for measurement error has received little empirical attention. 
According to dual-process theories of impression formation, individuals form impressions about others based on the social categories of the observed person (e.g., sex, race) and individual behaviors observed during an interaction. Although initial impressions start with heuristic, rule-of-thumb evaluations, systematic processing is characterized by extensive incorporation of available evidence. In a survey context, if interviewers default to heuristic information processing when evaluating respondent engagement, then we expect their evaluations to be primarily based on respondent characteristics and stereotypes associated with those characteristics. Under systematic processing, on the other hand, interviewers process and evaluate respondents based on observable respondent behaviors occurring during the question-answering process. We use the Work and Leisure Today Survey, including survey data and behavior codes, to examine proxy measures of heuristic and systematic processing by interviewers as predictors of interviewer postsurvey evaluations of respondents{\textquoteright} cooperativeness, interest, friendliness, and talkativeness. Our results indicate that CATI interviewers base their evaluations on actual behaviors during an interview (i.e., systematic processing) rather than perceived characteristics of the respondent or the interviewer (i.e., heuristic processing). These results are reassuring for the many surveys that collect interviewer observations as proxies for data quality.}, doi = {10.1093/poq/nfx026}, url = {https://doi.org/10.1093/poq/nfx026}, author = {Kirchner, Antje and Olson, Kristen and Smyth, Jolene D.} } @article {2629, title = {Dynamic Question Ordering in Online Surveys}, journal = {Journal of Official Statistics}, volume = {33}, year = {2017}, abstract = {Online surveys have the potential to support adaptive questions, where later questions depend on earlier responses. Past work has taken a rule-based approach, uniformly across all respondents. We envision a richer interpretation of adaptive questions, which we call dynamic question ordering (DQO), where question order is personalized. Such an approach could increase engagement, and therefore response rate, as well as imputation quality. We present a DQO framework to improve survey completion and imputation. In the general survey-taking setting, we want to maximize survey completion, and so we focus on ordering questions to engage the respondent and collect hopefully all information, or at least the information that most characterizes the respondent, for accurate imputations. In another scenario, our goal is to provide a personalized prediction. Since it is possible to give reasonable predictions with only a subset of questions, we are not concerned with motivating users to answer all questions. Instead, we want to order questions to get information that reduces prediction uncertainty, while not being too burdensome. We illustrate this framework with an example of providing energy estimates to prospective tenants. 
We also discuss DQO for national surveys and consider connections between our statistics-based question-ordering approach and cognitive survey methodology.}, doi = {https://doi.org/10.1515/jos-2017-0030}, author = {Early, Kirstin and Mankoff, Jennifer and Fienberg, Stephen E.} } @techreport {2568, title = {Earnings Inequality and Mobility Trends in the United States: Nationally Representative Estimates from Longitudinally Linked Employer-Employee Data}, year = {2017}, abstract = {Using earnings data from the U.S. Census Bureau, this paper analyzes the role of the employer in explaining the rise in earnings inequality in the United States. We first establish a consistent frame of analysis appropriate for administrative data used to study earnings inequality. We show that the trends in earnings inequality in the administrative data from the Longitudinal Employer-Household Dynamics Program are inconsistent with other data sources when we do not correct for the presence of misused SSNs. After this correction to the worker frame, we analyze how the earnings distribution has changed in the last decade. We present a decomposition of the year-to-year changes in the earnings distribution from 2004-2013. Even when simplifying these flows to movements between the bottom 20\%, the middle 60\% and the top 20\% of the earnings distribution, about 20.5 million workers undergo a transition each year. Another 19.9 million move between employment and nonemployment. To understand the role of the firm in these transitions, we estimate a model for log earnings with additive fixed worker and firm effects using all jobs held by eligible workers from 2004-2013. We construct a composite log earnings firm component across all jobs for a worker in a given year and a non-firm component. We also construct a skill-type index. We show that, while the difference between working at a low- or middle-paying firm are relatively small, the gains from working at a top-paying firm are large. Specifically, the benefits of working for a high-paying firm are not only realized today, through higher earnings paid to the worker, but also persist through an increase in the probability of upward mobility. High-paying firms facilitate moving workers to the top of the earnings distribution and keeping them there.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/34/}, author = {John M. Abowd and Kevin L. Mckinney and Nellie Zhao} } @techreport {handle:1813:52609, title = {Earnings Inequality and Mobility Trends in the United States: Nationally Representative Estimates from Longitudinally Linked Employer-Employee Data}, number = {1813:52609}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Earnings Inequality and Mobility Trends in the United States: Nationally Representative Estimates from Longitudinally Linked Employer-Employee Data Abowd, John M.; McKinney, Kevin L.; Zhao, Nellie Using earnings data from the U.S. Census Bureau, this paper analyzes the role of the employer in explaining the rise in earnings inequality in the United States. We first establish a consistent frame of analysis appropriate for administrative data used to study earnings inequality. We show that the trends in earnings inequality in the administrative data from the Longitudinal Employer-Household Dynamics Program are inconsistent with other data sources when we do not correct for the presence of misused SSNs. After this correction to the worker frame, we analyze how the earnings distribution has changed in the last decade. 
We present a decomposition of the year-to-year changes in the earnings distribution from 2004-2013. Even when simplifying these flows to movements between the bottom 20\%, the middle 60\% and the top 20\% of the earnings distribution, about 20.5 million workers undergo a transition each year. Another 19.9 million move between employment and nonemployment. To understand the role of the firm in these transitions, we estimate a model for log earnings with additive fixed worker and firm effects using all jobs held by eligible workers from 2004-2013. We construct a composite log earnings firm component across all jobs for a worker in a given year and a non-firm component. We also construct a skill-type index. We show that, while the difference between working at a low- or middle-paying firm are relatively small, the gains from working at a top-paying firm are large. Specifically, the benefits of working for a high-paying firm are not only realized today, through higher earnings paid to the worker, but also persist through an increase in the probability of upward mobility. High-paying firms facilitate moving workers to the top of the earnings distribution and keeping them there.}, url = {http://hdl.handle.net/1813/52609}, author = {Abowd, John M. and McKinney, Kevin L. and Zhao, Nellie} } @techreport {handle:1813:52650, title = {Effects of a Government-Academic Partnership: Has the NSF-Census Bureau Research Network Helped Secure the Future of the Federal Statistical System?}, number = {1813:52650}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

The National Science Foundation-Census Bureau Research Network (NCRN) was established in 2011 to create interdisciplinary research nodes on methodological questions of interest and significance to the broader research community and to the Federal Statistical System (FSS), particularly the Census Bureau. The activities to date have covered both fundamental and applied statistical research and have focused at least in part on the training of current and future generations of researchers in skills of relevance to surveys and alternative measurement of economic units, households, and persons. This paper discusses some of the key research findings of the eight nodes, organized into six topics: (1) Improving census and survey data collection methods; (2) Using alternative sources of data; (3) Protecting privacy and confidentiality by improving disclosure avoidance; (4) Using spatial and spatio-temporal statistical modeling to improve estimates; (5) Assessing data cost and quality tradeoffs; and (6) Combining information from multiple sources. It also reports on collaborations across nodes and with federal agencies, new software developed, and educational activities and outcomes. The paper concludes with an evaluation of the ability of the FSS to apply the NCRN{\textquoteright}s research outcomes and suggests some next steps, as well as the implications of this research-network model for future federal government renewal initiatives. This paper began as a May 8, 2015 presentation to the National Academies of Science{\textquoteright}s Committee on National Statistics by two of the principal investigators of the National Science Foundation-Census Bureau Research Network (NCRN) {\textendash} John Abowd and the late Steve Fienberg (Carnegie Mellon University). The authors acknowledge the contributions of the other principal investigators of the NCRN who are not co-authors of the paper (William Block, William Eddy, Alan Karr, Charles Manski, Nicholas Nagle, and Rebecca Nugent), the co-principal investigators, and the comments of Patrick Cantwell, Constance Citro, Adam Eck, Brian Harris-Kojetin, and Eloise Parker. We note with sorrow the deaths of Stephen Fienberg and Allan McCutcheon, two of the original NCRN principal investigators. The principal investigators also wish to acknowledge Cheryl Eavey{\textquoteright}s sterling grant administration on behalf of the NSF. The conclusions reached in this paper are not the responsibility of the National Science Foundation (NSF), the Census Bureau, or any of the institutions to which the authors belong.
}, url = {http://hdl.handle.net/1813/52650}, author = {Weinberg, Daniel and Abowd, John M. and Belli, Robert F. and Cressie, Noel and Folch, David C. and Holan, Scott H. and Levenstein, Margaret C. and Olson, Kristen M. and Reiter, Jerome P. and Shapiro, Matthew D. and Smyth, Jolene and Soh, Leen-Kiat and Spencer, Bruce and Spielman, Seth E. and Vilhuber, Lars and Wikle, Christopher} } @article {2562, title = {An empirical comparison of multiple imputation methods for categorical data}, journal = {The American Statistician}, volume = {71}, year = {2017}, month = {01/2017}, chapter = {162}, abstract = {Multiple imputation is a common approach for dealing with missing values in statistical databases. The imputer fills in missing values with draws from predictive models estimated from the observed data, resulting in multiple, completed versions of the database. Researchers have developed a variety of default routines to implement multiple imputation; however, there has been limited research comparing the performance of these methods, particularly for categorical data. We use simulation studies to compare repeated sampling properties of three default multiple imputation methods for categorical data, including chained equations using generalized linear models, chained equations using classification and regression trees, and a fully Bayesian joint distribution based on Dirichlet Process mixture models. We base the simulations on categorical data from the American Community Survey. In the circumstances of this study, the results suggest that default chained equations approaches based on generalized linear models are dominated by the default regression tree and Bayesian mixture model approaches. They also suggest competing advantages for the regression tree and Bayesian mixture model approaches, making both reasonable default engines for multiple imputation of categorical data. A supplementary material for this article is available online.}, keywords = {latent, missing, mixture, nonresponse, tree}, doi = {10.1080/00031305.2016.1277158}, url = {http://www.tandfonline.com/doi/full/10.1080/00031305.2016.1277158}, author = {F. Li and O. Akande and J. P. Reiter} } @article {2505, title = {Examining Changes of Interview Length over the Course of the Field Period}, journal = {Journal of Survey Statistics and Methodology}, volume = {5}, year = {2017}, month = {2017}, pages = {84-108}, abstract = {It is well established that interviewers learn behaviors both during training and on the job. How this learning occurs has received surprisingly little empirical attention: Is it driven by the interviewer herself or by the respondents she interviews? There are two competing hypotheses about what happens during field data collection: (1) interviewers learn behaviors from their previous interviews, and thus change their behavior in reaction to the behaviors previously encountered; and (2) interviewers encounter different types of and, especially, less cooperative respondents (i.e., nonresponse propensity affecting the measurement error situation), leading to changes in interview behaviors over the course of the field period. We refer to these hypotheses as the experience and response propensity hypotheses, respectively. 
This paper examines how proxy indicators for the experience and response propensity hypotheses relate to interview length, using data and paradata from two telephone surveys. Our results indicate that both interviewer-driven experience and respondent-driven response propensity are associated with the length of interview. While general interviewing experience is nonsignificant, within-study experience decreases interview length significantly, even when accounting for changes in sample composition. Interviewers with higher cooperation rates have significantly shorter interviews in study one; however, this effect is mediated by the number of words spoken by the interviewer. We find that older respondents and male respondents have longer interviews despite controlling for the number of words spoken, as do respondents who complete the survey at first contact. Not surprisingly, interviews are significantly longer the more words interviewers and respondents speak.}, isbn = {2325-0984}, url = {http://dx.doi.org/10.1093/jssam/smw031}, author = {Kirchner, Antje and Olson, Kristen} } @techreport {handle:1813:52164, title = {Formal Privacy Models and Title 13}, number = {1813:52164}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Formal Privacy Models and Title 13 Nissim, Kobbi; Gasser, Urs; Smith, Adam; Vadhan, Salil; O{\textquoteright}Brien, David; Wood, Alexandra A new collaboration between academia and the Census Bureau to further the Bureau{\textquoteright}s use of formal privacy models.}, url = {http://hdl.handle.net/1813/52164}, author = {Nissim, Kobbi and Gasser, Urs and Smith, Adam and Vadhan, Salil and O{\textquoteright}Brien, David and Wood, Alexandra} } @article {jpc:abowd:2017, title = {How Will Statistical Agencies Operate When All Data Are Private}, journal = {Journal of Privacy and Confidentiality}, volume = {7}, year = {2017}, publisher = {Cornell University}, abstract = {

The dual problems of respecting citizen privacy and protecting the confidentiality of their data have become hopelessly conflated in the {\textquotedblleft}Big Data{\textquotedblright} era. There are orders of magnitude more data outside an agency{\textquoteright}s firewall than inside it{\textemdash}compromising the integrity of traditional statistical disclosure limitation methods. And increasingly the information processed by the agency was {\textquotedblleft}asked{\textquotedblright} in a context wholly outside the agency{\textquoteright}s operations{\textemdash}blurring the distinction between what was asked and what is published. Already, private businesses like Microsoft, Google and Apple recognize that cybersecurity (safeguarding the integrity and access controls for internal data) and privacy protection (ensuring that what is published does not reveal too much about any person or business) are two sides of the same coin. This is a paradigm-shifting moment for statistical agencies.
}, url = {http://repository.cmu.edu/jpc/vol7/iss3/1/}, author = {Abowd, John M} } @article {2599, title = {Itemwise conditionally independent nonresponse modeling for incomplete multivariate data}, journal = {Biometrika }, volume = {104}, year = {2017}, month = {01/2017}, pages = {207-220}, chapter = {207}, abstract = {We introduce a nonresponse mechanism for multivariate missing data in which each study variable and its nonresponse indicator are conditionally independent given the remaining variables and their nonresponse indicators. This is a nonignorable missingness mechanism, in that nonresponse for any item can depend on values of other items that are themselves missing. We show that, under this itemwise conditionally independent nonresponse assumption, one can define and identify nonparametric saturated classes of joint multivariate models for the study variables and their missingness indicators. We also show how to perform sensitivity analysis to violations of the conditional independence assumptions encoded by this missingness mechanism. Throughout, we illustrate the use of this modeling approach with data analyses.}, keywords = {Loglinear model, Missing not at random, Missingness mechanism, Nonignorable, Nonparametric saturated, Sensitivity analysis}, doi = {10.1093/biomet/asw063}, url = {https://doi.org/10.1093/biomet/asw063}, author = {M. Sadinle and J.P. Reiter} } @article {sadinle:reiter:bmka, title = {Itemwise conditionally independent nonresponse modeling for multivariate categorical data}, journal = {Biometrika}, volume = {104}, year = {2017}, month = {01/2017}, pages = {207-220}, abstract = {With nonignorable missing data, likelihood-based inference should be based on the joint distribution of the study variables and their missingness indicators. These joint models cannot be estimated from the data alone, thus requiring the analyst to impose restrictions that make the models uniquely obtainable from the distribution of the observed data. We present an approach for constructing classes of identifiable nonignorable missing data models. The main idea is to use a sequence of carefully set up identifying assumptions, whereby we specify potentially different missingness mechanisms for different blocks of variables. We show that the procedure results in models with the desirable property of being non-parametric saturated.}, keywords = {Identification, Missing not at random, Non-parametric saturated, Partial ignorability, Sensitivity analysis}, author = {Sadinle, M. and Reiter, J. P.} } @techreport {VilhuberLagozeLDI2017, title = {Making Confidential Data Part of Reproducible Research}, number = {41}, year = {2017}, institution = {Labor Dynamics Institute, Cornell University}, type = {Document}, url = {http://digitalcommons.ilr.cornell.edu/ldi/41/}, author = {Lars Vilhuber and Carl Lagoze} } @techreport {handle:1813:52474, title = {Making Confidential Data Part of Reproducible Research}, number = {1813:52474}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Making Confidential Data Part of Reproducible Research Vilhuber, Lars; Lagoze, Carl Disclaimer and acknowledgements: While this column mentions the Census Bureau several times, any opinions and conclusions expressed herein are those of the authors and do not necessarily represent the views of the U.S. 
Census Bureau or the other statistical agencies mentioned herein.}, url = {http://hdl.handle.net/1813/52474}, author = {Vilhuber, Lars and Lagoze, Carl} } @article {chance:2017, title = {Making Confidential Data Part of Reproducible Research}, journal = {Chance}, year = {2017}, month = {09/2017}, url = {http://chance.amstat.org/2017/09/reproducible-research/}, author = {Vilhuber, Lars and Lagoze, Carl} } @article {doi:10.1080/07350015.2017.1356727, title = {Modeling Endogenous Mobility in Earnings Determination}, journal = {Journal of Business \& Economic Statistics}, number = {ja}, year = {2017}, pages = {0-0}, abstract = {We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax exogenous mobility by modeling the matched data as an evolving bipartite graph using a Bayesian latent-type framework. Our results suggest that allowing endogenous mobility increases the variation in earnings explained by individual heterogeneity and reduces the proportion due to employer and match effects. To assess external validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The mobility-bias corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, doi = {10.1080/07350015.2017.1356727}, url = {http://dx.doi.org/10.1080/07350015.2017.1356727}, author = {John M. Abowd and Kevin L. Mckinney and Ian M. Schmutte} } @techreport {2575, title = {Modeling Endogenous Mobility in Wage Determination}, year = {2017}, abstract = {We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax the exogenous mobility assumptions by modeling the evolution of the matched data as an evolving bipartite graph using a Bayesian latent class framework. Our results suggest that endogenous mobility biases estimated firm effects toward zero. To assess validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/28/}, author = {John M. Abowd and Kevin L. Mckinney and Ian M. Schmutte} } @article {2236, title = {Multiple imputation of missing categorical and continuous outcomes via Bayesian mixture models with local dependence}, journal = {Journal of the American Statistical Association}, volume = {111}, year = {2017}, month = {01/2017}, pages = {1466 {\textendash} 1479}, abstract = {We present a nonparametric Bayesian joint model for multivariate continuous and categorical variables, with the intention of developing a flexible engine for multiple imputation of missing values. 
The model fuses Dirichlet process mixtures of multinomial distributions for categorical variables with Dirichlet process mixtures of multivariate normal distributions for continuous variables. We incorporate dependence between the continuous and categorical variables by (i) modeling the means of the normal distributions as component-specific functions of the categorical variables and (ii) forming distinct mixture components for the categorical and continuous data with probabilities that are linked via a hierarchical model. This structure allows the model to capture complex dependencies between the categorical and continuous data with minimal tuning by the analyst. We apply the model to impute missing values due to item nonresponse in an evaluation of the redesign of the Survey of Income and Program Participation (SIPP). The goal is to compare estimates from a field test with the new design to estimates from selected individuals from a panel collected under the old design. We show that accounting for the missing data changes some conclusions about the comparability of the distributions in the two datasets. We also perform an extensive repeated sampling simulation using similar data from complete cases in an existing SIPP panel, comparing our proposed model to a default application of multiple imputation by chained equations. Imputations based on the proposed model tend to have better repeated sampling properties than the default application of chained equations in this realistic setting.}, keywords = {Hierarchical mixture model, Missing data, Nonparametric Bayes, Stick-breaking process}, author = {J. S. Murray and J. P. Reiter} } @article {2656, title = {Multi-rubric Models for Ordinal Spatial Data with Application to Online Ratings from Yelp}, year = {2017}, abstract = {Interest in online rating data has increased in recent years. Such data consists of ordinal ratings of products or local businesses provided by users of a website, such as \Yelp\ or \texttt{Amazon}. One source of heterogeneity in ratings is that users apply different standards when supplying their ratings; even if two users benefit from a product the same amount, they may translate their benefit into ratings in different ways. In this article we propose an ordinal data model, which we refer to as a multi-rubric model, which treats the criteria used to convert a latent utility into a rating as user-specific random effects, with the distribution of these random effects being modeled nonparametrically. We demonstrate that this approach is capable of accounting for this type of variability in addition to usual sources of heterogeneity due to item quality, user biases, interactions between items and users, and the spatial structure of the users and items. We apply the model developed here to publicly available data from the website \Yelp\ and demonstrate that it produces interpretable clusterings of users according to their rating behavior, in addition to providing better predictions of ratings and better summaries of overall item quality.}, keywords = {Bayesian hierarchical model, Data augmentation, Nonparametric Bayes, ordinal data, recommender systems, spatial prediction.}, url = {https://arxiv.org/abs/1706.03012}, author = {Linero, A.R. and Bradley, J.R. 
and Desai, A.} } @techreport {handle:1813:52163, title = {NCRN Meeting Spring 2017}, number = {1813:52163}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017 Vilhuber, Lars}, url = {http://hdl.handle.net/1813/52163}, author = {Vilhuber, Lars} } @techreport {handle:1813:52164, title = {NCRN Meeting Spring 2017: Formal Privacy Models and Title 13}, number = {1813:52164}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017: Formal Privacy Models and Title 13 Nissim, Kobbi; Gasser, Urs; Smith, Adam; Vadhan, Salil; O{\textquoteright}Brien, David; Wood, Alexandra A new collaboration between academia and the Census Bureau to further the Bureau{\textquoteright}s use of formal privacy models.}, url = {http://hdl.handle.net/1813/52164}, author = {Nissim, Kobbi and Gasser, Urs and Smith, Adam and Vadhan, Salil and O{\textquoteright}Brien, David and Wood, Alexandra} } @techreport {handle:1813:52163, title = {NCRN Meeting Spring 2017: Welcome}, number = {1813:52163}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017: Welcome Vilhuber, Lars}, url = {http://hdl.handle.net/1813/52163}, author = {Vilhuber, Lars} } @techreport {handle:1813:46686, title = {NCRN Newsletter: Volume 3 - Issue 3}, number = {1813:46686}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 3 - Issue 3 Vilhuber, Lars; Knight-Ingram, Dory Overview of activities at NSF-Census Research Network nodes from December 2016 through February 2017. NCRN Newsletter Vol. 3, Issue 3: March 10, 2017}, url = {http://hdl.handle.net/1813/46686}, author = {Vilhuber, Lars and Knight-Ingram, Dory} } @techreport {handle:1813:52259, title = {NCRN Newsletter: Volume 3 - Issue 4}, number = {1813:52259}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 3 - Issue 4 Vilhuber, Lars; Knight-Ingram, Dory The NCRN Newsletter is published quarterly by the NCRN Coordinating Office.}, url = {http://hdl.handle.net/1813/52259}, author = {Vilhuber, Lars and Knight-Ingram, Dory} } @techreport {handle:1813:52656, title = {Presentation: Introduction to Stan for Markov Chain Monte Carlo}, number = {1813:52656}, year = {2017}, institution = {University of Missouri}, type = {Preprint}, abstract = {Presentation: Introduction to Stan for Markov Chain Monte Carlo Simpson, Matthew An introduction to Stan (http://mc-stan.org/): a probabilistic programming language that implements Hamiltonian Monte Carlo (HMC), variational Bayes, and (penalized) maximum likelihood estimation. Presentation given at the U.S. 
Census Bureau on April 25, 2017.}, url = {http://hdl.handle.net/1813/52656}, author = {Simpson, Matthew} } @techreport {handle:1813:46197, title = {Proceedings from the 2016 NSF{\textendash}Sloan Workshop on Practical Privacy}, number = {1813:46197}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Proceedings from the 2016 NSF{\textendash}Sloan Workshop on Practical Privacy Vilhuber, Lars; Schmutte, Ian On October 14, 2016, we hosted a workshop that brought together economists, survey statisticians, and computer scientists with expertise in the field of privacy preserving methods: Census Bureau staff working on implementing cutting-edge methods in the Bureau{\textquoteright}s flagship public-use products mingled with academic researchers from a variety of universities. The four products discussed as part of the workshop were 1. the American Community Survey (ACS); 2. Longitudinal Employer-Household Data (LEHD), in particular the LEHD Origin-Destination Employment Statistics (LODES); 3. the 2020 Decennial Census; and 4. the 2017 Economic Census. The goal of the workshop was to 1. Discuss the specific challenges that have arisen in ongoing efforts to apply formal privacy models to Census data products by drawing together expertise of academic and governmental researchers; 2. Produce short written memos that summarize concrete suggestions for practical applications to specific Census Bureau priority areas.}, url = {http://hdl.handle.net/1813/46197}, author = {Vilhuber, Lars and Schmutte, Ian} } @techreport {handle:1813:52473, title = {Proceedings from the 2017 Cornell-Census- NSF- Sloan Workshop on Practical Privacy}, number = {1813:52473}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Proceedings from the 2017 Cornell-Census- NSF- Sloan Workshop on Practical Privacy Vilhuber, Lars; Schmutte, Ian M. These proceedings report on a workshop hosted at the U.S. Census Bureau on May 8, 2017. Our purpose was to gather experts from various backgrounds together to continue discussing the development of formal privacy systems for Census Bureau data products. This workshop was a successor to a previous workshop held in October 2016 (Vilhuber \& Schmutte 2017). At our prior workshop, we hosted computer scientists, survey statisticians, and economists, all of whom were experts in data privacy. At that time we discussed the practical implementation of cutting-edge methods for publishing data with formal, provable privacy guarantees, with a focus on applications to Census Bureau data products. The teams developing those applications were just starting out when our first workshop took place, and we spent our time brainstorming solutions to the various problems researchers were encountering, or anticipated encountering. For these cutting-edge formal privacy models, there had been very little effort in the academic literature to apply those methods in real-world settings with large, messy data. We therefore brought together an expanded group of specialists from academia and government who could shed light on technical challenges, subject matter challenges and address how data users might react to changes in data availability and publishing standards. In May 2017, we organized a follow-up workshop, which these proceedings report on. We reviewed progress made in four different areas. The four topics discussed as part of the workshop were 1. the 2020 Decennial Census; 2. the American Community Survey (ACS); 3. the 2017 Economic Census; 4.
measuring the demand for privacy and for data quality. As in our earlier workshop, our goals were to 1. Discuss the specific challenges that have arisen in ongoing efforts to apply formal privacy models to Census data products by drawing together expertise of academic and governmental researchers; 2. Produce short written memos that summarize concrete suggestions for practical applications to specific Census Bureau priority areas. Comments can be provided at https://goo.gl/ZAh3YE}, url = {http://hdl.handle.net/1813/52473}, author = {Vilhuber, Lars and Schmutte, Ian M.} } @techreport {handle:1813:52472, title = {Proceedings from the Synthetic LBD International Seminar}, number = {1813:52472}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Proceedings from the Synthetic LBD International Seminar Vilhuber, Lars; Kinney, Saki; Schmutte, Ian M. On May 9, 2017, we hosted a seminar to discuss the conditions necessary to implement the SynLBD approach with interested parties, with the goal of providing a straightforward toolkit to implement the same procedure on other data. The proceedings summarize the discussions during the workshop.}, url = {http://hdl.handle.net/1813/52472}, author = {Vilhuber, Lars and Kinney, Saki and Schmutte, Ian M.} } @techreport {handle:1813:52649, title = {Recalculating - How Uncertainty in Local Labor Market Definitions Affects Empirical Findings}, number = {1813:52649}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Recalculating - How Uncertainty in Local Labor Market Definitions Affects Empirical Findings Foote, Andrew; Kutzbach, Mark J.; Vilhuber, Lars This paper evaluates the use of commuting zones as a local labor market definition. We revisit Tolbert and Sizer (1996) and demonstrate the sensitivity of definitions to two features of the methodology. We show how these features impact empirical estimates using a well-known application of commuting zones. We conclude with advice to researchers using commuting zones on how to demonstrate the robustness of empirical findings to uncertainty in definitions. The analysis, conclusions, and opinions expressed herein are those of the author(s) alone and do not necessarily represent the views of the U.S. Census Bureau or the Federal Deposit Insurance Corporation. All results have been reviewed to ensure that no confidential information is disclosed, and no confidential data was used in this paper. This document is released to inform interested parties of ongoing research and to encourage discussion of work in progress. Much of the work developing this paper occurred while Mark Kutzbach was an employee of the U.S. Census Bureau.}, url = {http://hdl.handle.net/1813/52649}, author = {Foote, Andrew and Kutzbach, Mark J. and Vilhuber, Lars} } @article {2657, title = {Regionalization of Multiscale Spatial Processes using a Criterion for Spatial Aggregation Error}, journal = {Journal of the Royal Statistical Society -- Series B.}, year = {2017}, abstract = {The modifiable areal unit problem and the ecological fallacy are known problems that occur when modeling multiscale spatial processes. We investigate how these forms of spatial aggregation error can guide a regionalization over a spatial domain of interest. By "regionalization" we mean a specification of geographies that define the spatial support for areal data. This topic has been studied vigorously by geographers, but has been given less attention by spatial statisticians.
Thus, we propose a criterion for spatial aggregation error (CAGE), which we minimize to obtain an optimal regionalization. To define CAGE we draw a connection between spatial aggregation error and a new multiscale representation of the Karhunen-Loeve (K-L) expansion. This relationship between CAGE and the multiscale K-L expansion leads to illuminating theoretical developments including: connections between spatial aggregation error, squared prediction error, spatial variance, and a novel extension of Obled-Creutin eigenfunctions. The effectiveness of our approach is demonstrated through an analysis of two datasets, one using the American Community Survey and one related to environmental ocean winds.}, keywords = {American Community Survey, empirical orthogonal functions, MAUP, Reduced rank, Spatial basis functions, Survey data}, url = {https://arxiv.org/abs/1502.01974}, author = {Bradley, J.R. and Wikle, C.K. and Holan, S.H.} } @techreport {2567, title = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {37}, year = {2017}, month = {04/2017}, abstract = {We consider the problem of determining the optimal accuracy of public statistics when increased accuracy requires a loss of privacy. To formalize this allocation problem, we use tools from statistics and computer science to model the publication technology used by a public statistical agency. We derive the demand for accurate statistics from first principles to generate interdependent preferences that account for the public-good nature of both data accuracy and privacy loss. We first show data accuracy is inefficiently under-supplied by a private provider. Solving the appropriate social planner{\textquoteright}s problem produces an implementable publication strategy. We implement the socially optimal publication plan for statistics on income and health status using data from the American Community Survey, National Health Interview Survey, Federal Statistical System Public Opinion Survey and Cornell National Social Survey. Our analysis indicates that welfare losses from providing too much privacy protection and, therefore, too little accuracy can be substantial.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/37/}, author = {John M. Abowd and Ian M. Schmutte} } @techreport {handle:1813:39081, title = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {1813:39081}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods Abowd, John; Schmutte, Ian M. We consider the problem of the public release of statistical information about a population{\textendash}explicitly accounting for the public-good properties of both data accuracy and privacy loss. We first consider the implications of adding the public-good component to recently published models of private data publication under differential privacy guarantees using a Vickrey-Clarke-Groves mechanism and a Lindahl mechanism. We show that data quality will be inefficiently under-supplied. Next, we develop a standard social planner{\textquoteright}s problem using the technology set implied by (ε, δ)-differential privacy with (α, β)-accuracy for the Private Multiplicative Weights query release mechanism to study the properties of optimal provision of data accuracy and privacy loss when both are public goods. 
Using the production possibilities frontier implied by this technology, explicitly parameterized interdependent preferences, and the social welfare function, we display properties of the solution to the social planner{\textquoteright}s problem. Our results directly quantify the optimal choice of data accuracy and privacy loss as functions of the technology and preference parameters. Some of these properties can be quantified using population statistics on marginal preferences and correlations between income, data accuracy preferences, and privacy loss preferences that are available from survey data. Our results show that government data custodians should publish more accurate statistics with weaker privacy guarantees than would occur with purely private data publishing. Our statistical results using the General Social Survey and the Cornell National Social Survey indicate that the welfare losses from under-providing data accuracy while over-providing privacy protection can be substantial. A complete archive of the data and programs used in this paper is available via http://doi.org/10.5281/zenodo.345385.}, url = {http://hdl.handle.net/1813/39081}, author = {Abowd, John and Schmutte, Ian M.} } @techreport {handle:1813:52612, title = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {1813:52612}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods Abowd, John; Schmutte, Ian M. We consider the problem of determining the optimal accuracy of public statistics when increased accuracy requires a loss of privacy. To formalize this allocation problem, we use tools from statistics and computer science to model the publication technology used by a public statistical agency. We derive the demand for accurate statistics from first principles to generate interdependent preferences that account for the public-good nature of both data accuracy and privacy loss. We first show data accuracy is inefficiently under-supplied by a private provider. Solving the appropriate social planner{\textquoteright}s problem produces an implementable publication strategy. We implement the socially optimal publication plan for statistics on income and health status using data from the American Community Survey, National Health Interview Survey, Federal Statistical System Public Opinion Survey and Cornell National Social Survey. Our analysis indicates that welfare losses from providing too much privacy protection and, therefore, too little accuracy can be substantial.}, url = {http://hdl.handle.net/1813/52612}, author = {Abowd, John and Schmutte, Ian M.} } @article {2560, title = {The role of statistical disclosure limitation in total survey error}, journal = {Total Survey Error in Practice}, year = {2017}, pages = {71 {\textendash} 94}, abstract = {This chapter presents the thesis that statistical disclosure limitation (SDL) ought to be viewed as an integral component of total survey error (TSE). TSE and SDL will move forward together by integrating multiple criteria: cost, risk, data quality, and decision quality. The chapter explores the value of unifying two key TSE procedures - editing and imputation - with SDL. 
It discusses {\textquotedblleft}Big data{\textquotedblright} issues and presents a mathematical formulation that, at least conceptually and at some point in the future, does unify TSE and SDL. Modern approaches to SDL are based explicitly or implicitly on tradeoffs between disclosure risk and data utility. There are three principal classes of SDL methods: reduction/coarsening techniques; perturbative methods; and synthetic data methods. Data swapping is among the most frequently applied SDL methods for categorical data. The chapter sketches how it can be informed by knowledge of TSE.}, keywords = {big data issues, data quality, data swapping, decision quality, risk-utility paradigms, Statistical Disclosure Limitation, total survey error}, doi = {10.1002/9781119041702.ch4}, author = {A. F. Karr} } @booklet {2502, title = {Sequential Prediction of Respondent Behaviors Leading to Error in Web-based Surveys}, year = {2017}, author = {Eck, Adam and Soh, Leen-Kiat} } @techreport {ldi40, title = {Sorting Between and Within Industries: A Testable Model of Assortative Matching}, number = {40}, year = {2017}, institution = {Labor Dynamics Institute}, type = {Document}, abstract = {We test Shimer{\textquoteright}s (2005) theory of the sorting of workers between and within industrial sectors based on directed search with coordination frictions, deliberately maintaining its static general equilibrium framework. We fit the model to sector-specific wage, vacancy and output data, including publicly-available statistics that characterize the distribution of worker and employer wage heterogeneity across sectors. Our empirical method is general and can be applied to a broad class of assignment models. The results indicate that industries are the loci of sorting{\textendash}more productive workers are employed in more productive industries. The evidence confirms that strong assortative matching can be present even when worker and employer components of wage heterogeneity are weakly correlated.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/40/}, author = {John M. Abowd and Francis Kramarz and Sebastien Perez-Duarte and Ian M. Schmutte} } @article {2563, title = {Stop or continue data collection: A nonignorable missing data approach for continuous variables}, journal = {Journal of Official Statistics}, year = {2017}, abstract = {We present an approach to inform decisions about nonresponse follow-up sampling. The basic idea is (i) to create completed samples by imputing nonrespondents{\textquoteright} data under various assumptions about the nonresponse mechanisms, (ii) to take hypothetical samples of varying sizes from the completed samples, and (iii) to compute and compare measures of accuracy and cost for different proposed sample sizes. As part of the methodology, we present a new approach for generating imputations for multivariate continuous data with nonignorable unit nonresponse. We fit mixtures of multivariate normal distributions to the respondents{\textquoteright} data, and adjust the probabilities of the mixture components to generate nonrespondents{\textquoteright} distributions with desired features. We illustrate the approaches using data from the 2007 U. S. Census of Manufactures.}, author = {T. Paiva and J. P. 
Reiter} } @techreport {handle:1813:52611, title = {Two Perspectives on Commuting: A Comparison of Home to Work Flows Across Job-Linked Survey and Administrative Files}, number = {1813:52611}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Two Perspectives on Commuting: A Comparison of Home to Work Flows Across Job-Linked Survey and Administrative Files Green, Andrew; Kutzbach, Mark J.; Vilhuber, Lars Commuting flows and workplace employment data have a wide constituency of users including urban and regional planners, social science and transportation researchers, and businesses. The U.S. Census Bureau releases two, national data products that give the magnitude and characteristics of home to work flows. The American Community Survey (ACS) tabulates households{\textquoteright} responses on employment, workplace, and commuting behavior. The Longitudinal Employer-Household Dynamics (LEHD) program tabulates administrative records on jobs in the LEHD Origin-Destination Employment Statistics (LODES). Design differences across the datasets lead to divergence in a comparable statistic: county-to-county aggregate commute flows. To understand differences in the public use data, this study compares ACS and LEHD source files, using identifying information and probabilistic matching to join person and job records. In our assessment, we compare commuting statistics for job frames linked on person, employment status, employer, and workplace and we identify person and job characteristics as well as design features of the data frames that explain aggregate differences. We find a lower rate of within-county commuting and farther commutes in LODES. We attribute these greater distances to differences in workplace reporting and to uncertainty of establishment assignments in LEHD for workers at multi-unit employers. Minor contributing factors include differences in residence location and ACS workplace edits. The results of this analysis and the data infrastructure developed will support further work to understand and enhance commuting statistics in both datasets.}, url = {http://hdl.handle.net/1813/52611}, author = {Green, Andrew and Kutzbach, Mark J. and Vilhuber, Lars} } @techreport {2017arXiv171002690C, title = {{Unique Entity Estimation with Application to the Syrian Conflict}}, number = {1710.02690}, year = {2017}, abstract = {Entity resolution identifies and removes duplicate entities in large, noisy databases and has grown in both usage and new developments as a result of increased data availability. Nevertheless, entity resolution has tradeoffs regarding assumptions of the data generation process, error rates, and computational scalability that make it a difficult task for real applications. In this paper, we focus on a related problem of unique entity estimation, which is the task of estimating the unique number of entities and associated standard errors in a data set with duplicate entities. Unique entity estimation shares many fundamental challenges of entity resolution, namely, that the computational cost of all-to-all entity comparisons is intractable for large databases. To circumvent this computational barrier, we propose an efficient (near-linear time) estimation algorithm based on locality sensitive hashing. Our estimator, under realistic assumptions, is unbiased and has provably low variance compared to existing random sampling based approaches. In addition, we empirically show its superiority over the state-of-the-art estimators on three real applications. 
The motivation for our work is to derive an accurate estimate of the documented, identifiable deaths in the ongoing Syrian conflict. Our methodology, when applied to the Syrian data set, provides an estimate of $191,874 \pm 1772$ documented, identifiable deaths, which is very close to the Human Rights Data Analysis Group (HRDAG) estimate of 191,369. Our work provides an example of challenges and efforts involved in solving a real, noisy challenging problem where modeling assumptions may not hold. }, keywords = {Computer Science - Data Structures and Algorithms, Computer Science - Databases, Statistics - Applications}, url = {https://arxiv.org/abs/1710.02690}, author = {Chen, B. and Shrivastava, A. and Steorts, R.~C.} } @article {2541, title = {Utility Cost of Formal Privacy for Releasing National Employer-Employee Statistics}, journal = {Proceedings of the 2017 ACM International Conference on Management of Data}, year = {2017}, abstract = {National statistical agencies around the world publish tabular summaries based on combined employer-employee (ER-EE) data. The privacy of both individuals and business establishments that feature in these data are protected by law in most countries. These data are currently released using a variety of statistical disclosure limitation (SDL) techniques that do not reveal the exact characteristics of particular employers and employees, but lack provable privacy guarantees limiting inferential disclosures. In this work, we present novel algorithms for releasing tabular summaries of linked ER-EE data with formal, provable guarantees of privacy. We show that state-of-the-art differentially private algorithms add too much noise for the output to be useful. Instead, we identify the privacy requirements mandated by current interpretations of the relevant laws, and formalize them using the Pufferfish framework. We then develop new privacy definitions that are customized to ER-EE data and satisfy the statutory privacy requirements. We implement the experiments in this paper on production data gathered by the U.S. Census Bureau. An empirical evaluation of utility for these data shows that for reasonable values of the privacy-loss parameter ε>= 1, the additive error introduced by our provably private algorithms is comparable, and in some cases better, than the error introduced by existing SDL techniques that have no provable privacy guarantees. For some complex queries currently published, however, our algorithms do not have utility comparable to the existing traditional SDL algorithms. Those queries are fodder for future research.}, isbn = { 978-1-4503-4197-4 }, doi = {10.1145/3035918.3035940}, url = {http://dl.acm.org/citation.cfm?doid=3035918.3035940}, author = {Samuel Haney and Ashwin Machanavajjhala and John M. Abowd and Matthew Graham and Mark Kutzbach} } @techreport {handle:1813:49652, title = {Utility Cost of Formal Privacy for Releasing National Employer-Employee Statistics}, number = {1813:49652}, year = {2017}, institution = {Cornell University}, type = {Preprint}, abstract = {Utility Cost of Formal Privacy for Releasing National Employer-Employee Statistics Haney, Samuel; Machanavajjhala, Ashwin; Abowd, John M; Graham, Matthew; Kutzbach, Mark; Vilhuber, Lars National statistical agencies around the world publish tabular summaries based on combined employeremployee (ER-EE) data. The privacy of both individuals and business establishments that feature in these data are protected by law in most countries. 
These data are currently released using a variety of statistical disclosure limitation (SDL) techniques that do not reveal the exact characteristics of particular employers and employees, but lack provable privacy guarantees limiting inferential disclosures. In this work, we present novel algorithms for releasing tabular summaries of linked ER-EE data with formal, provable guarantees of privacy. We show that state-of-the-art differentially private algorithms add too much noise for the output to be useful. Instead, we identify the privacy requirements mandated by current interpretations of the relevant laws, and formalize them using the Pufferfish framework. We then develop new privacy definitions that are customized to ER-EE data and satisfy the statutory privacy requirements. We implement the experiments in this paper on production data gathered by the U.S. Census Bureau. An empirical evaluation of utility for these data shows that for reasonable values of the privacy-loss parameter ϵ>=1, the additive error introduced by our provably private algorithms is comparable, and in some cases better, than the error introduced by existing SDL techniques that have no provable privacy guarantees. For some complex queries currently published, however, our algorithms do not have utility comparable to the existing traditional}, url = {http://hdl.handle.net/1813/49652}, author = {Haney, Samuel and Machanavajjhala, Ashwin and Abowd, John M and Graham, Matthew and Kutzbach, Mark and Vilhuber, Lars} } @article {2660, title = {Visualizing uncertainty in areal data estimates with bivariate choropleth maps, map pixelation, and glyph rotation}, journal = {Stat}, volume = {6}, year = {2017}, pages = {292{\textendash}302}, abstract = {In statistics, we quantify uncertainty to help determine the accuracy of estimates, yet this crucial piece of information is rarely included on maps visualizing areal data estimates. We develop and present three approaches to include uncertainty on maps: (1) the bivariate choropleth map repurposed to visualize uncertainty; (2) the pixelation of counties to include values within an estimate{\textquoteright}s margin of error; and (3) the rotation of a glyph, located at a county{\textquoteright}s centroid, to represent an estimate{\textquoteright}s uncertainty. The second method is presented as both a static map and visuanimation. We use American Community Survey estimates and their corresponding margins of error to demonstrate the methods and highlight the importance of visualizing uncertainty in areal data. An extensive online supplement provides the R code necessary to produce the maps presented in this article as well as alternative versions of them. }, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.150/abstract}, author = {Lucchesi, L.R. 
and Wikle, C.K.} } @techreport {handle:1813:52165, title = {2017 Economic Census: Towards Synthetic Data Sets}, number = {1813:52165}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {2017 Economic Census: Towards Synthetic Data Sets Caldwell, Carol; Thompson, Katherine Jenny}, url = {http://hdl.handle.net/1813/52165}, author = {Caldwell, Carol and Thompson, Katherine Jenny} } @article {2192, title = {Assessing disclosure risks for synthetic data with arbitrary intruder knowledge}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, month = {02/2016}, pages = {109-126}, chapter = {109}, abstract = { Several statistical agencies release synthetic microdata, i.e., data with all confidential values replaced with draws from statistical models, in order to protect data subjects{\textquoteright} confidentiality. While fully synthetic data are safe from record linkage attacks, intruders might be able to use the released synthetic values to estimate confidential values for individuals in the collected data. We demonstrate and investigate this potential risk using two simple but informative scenarios: a single continuous variable possibly with outliers, and a three-way contingency table possibly with small counts in some cells. Beginning with the case that the intruder knows all but one value in the confidential data, we examine the effect on risk of decreasing the number of observations the intruder knows beforehand. We generally find that releasing synthetic data (1) can pose little risk to records in the middle of the distribution, and (2) can pose some risks to extreme outliers, although arguably these risks are mild. We also find that the effect of removing observations from an intruder{\textquoteright}s background knowledge heavily depends on how well that intruder can fill in those missing observations: the risk remains fairly constant if he/she can fill them in well, and drops quickly if he/she cannot. }, keywords = {confidentiality, Disclosure, risk, synthetic}, doi = {10.3233/SJI-160957}, url = {http://content.iospress.com/download/statistical-journal-of-the-iaos/sji957}, author = {McClure, D. and Reiter , J. P.} } @article {deyoreo:kottas:sc, title = {A {B}ayesian nonparametric {M}arkovian model for nonstationary time series}, journal = {Statistics and Computing}, year = {2016}, month = {01/2016}, chapter = {1}, abstract = {Stationary time series models built from parametric distributions are, in general, limited in scope due to the assumptions imposed on the residual distribution and autoregression relationship. We present a modeling approach for univariate time series data, which makes no assumptions of stationarity, and can accommodate complex dynamics and capture nonstandard distributions. The model for the transition density arises from the conditional distribution implied by a Bayesian nonparametric mixture of bivariate normals. This implies a flexible autoregressive form for the conditional transition density, defining a time-homogeneous, nonstationary, Markovian model for real-valued data indexed in discrete-time. To obtain a more computationally tractable algorithm for posterior inference, we utilize a square-root-free Cholesky decomposition of the mixture kernel covariance matrix. Results from simulated data suggest the model is able to recover challenging transition and predictive densities. 
We also illustrate the model on time intervals between eruptions of the Old Faithful geyser. Extensions to accommodate higher order structure and to develop a state-space model are also discussed.}, keywords = {Autoregressive Models, Bayesian Nonparametrics, Dirichlet Process Mixtures, Markov chain Monte Carlo, Nonstationarity, Time Series}, author = {De Yoreo, M. and Kottas, A.} } @article {doi:10.1080/01621459.2015.1105807, title = {A Bayesian Approach to Graphical Record Linkage and Deduplication}, journal = {Journal of the American Statistical Association}, volume = {111}, number = {516}, year = {2016}, pages = {1660-1672}, abstract = {ABSTRACTWe propose an unsupervised approach for linking records across arbitrarily many files, while simultaneously detecting duplicate records within files. Our key innovation involves the representation of the pattern of links between records as a bipartite graph, in which records are directly linked to latent true individuals, and only indirectly linked to other records. This flexible representation of the linkage structure naturally allows us to estimate the attributes of the unique observable people in the population, calculate transitive linkage probabilities across records (and represent this visually), and propagate the uncertainty of record linkage into later analyses. Our method makes it particularly easy to integrate record linkage with post-processing procedures such as logistic regression, capture{\textendash}recapture, etc. Our linkage structure lends itself to an efficient, linear-time, hybrid Markov chain Monte Carlo algorithm, which overcomes many obstacles encountered by previously record linkage approaches, despite the high-dimensional parameter space. We illustrate our method using longitudinal data from the National Long Term Care Survey and with data from the Italian Survey on Household and Wealth, where we assess the accuracy of our method and show it to be better in terms of error rates and empirical scalability than other approaches in the literature. Supplementary materials for this article are available online.}, doi = {10.1080/01621459.2015.1105807}, url = {http://dx.doi.org/10.1080/01621459.2015.1105807}, author = {Rebecca C. Steorts and Rob Hall and Stephen E. Fienberg} } @article {2665, title = {Bayesian Hierarchical Models with Conjugate Full-Conditional Distributions for Dependent Data from the Natural Exponential Family}, journal = {Journal of the American Statistical Association - T\&M.}, year = {2016}, abstract = {We introduce a Bayesian approach for analyzing (possibly) high-dimensional dependent data that are distributed according to a member from the natural exponential family of distributions. This problem requires extensive methodological advancements, as jointly modeling high-dimensional dependent data leads to the so-called "big n problem." The computational complexity of the "big n problem" is further exacerbated when allowing for non-Gaussian data models, as is the case here. Thus, we develop new computationally efficient distribution theory for this setting. In particular, we introduce something we call the "conjugate multivariate distribution," which is motivated by the univariate distribution introduced in Diaconis and Ylvisaker (1979). 
Furthermore, we provide substantial theoretical and methodological development including: results regarding conditional distributions, an asymptotic relationship with the multivariate normal distribution, conjugate prior distributions, and full-conditional distributions for a Gibbs sampler. The results in this manuscript are extremely general, and can be adapted to many different settings. We demonstrate the proposed methodology through simulated examples and analyses based on estimates obtained from the US Census Bureaus{\textquoteright} American Community Survey (ACS).}, url = {https://arxiv.org/abs/1701.07506}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K.} } @article {si:reiter:hillygus16, title = {Bayesian latent pattern mixture models for handling attrition in panel studies with refreshment samples}, journal = {Annals of Applied Statistics}, volume = {10}, year = {2016}, pages = {118-{\textendash}143}, doi = {10.1214/15-AOAS876}, url = {http://projecteuclid.org/euclid.aoas/1458909910}, author = {Y. Si and J. P. Reiter and D. S. Hillygus} } @article {2668, title = {Bayesian Lattice Filters for Time-Varying Autoregression and Time-Frequency Analysis}, journal = {Bayesian Analysis}, year = {2016}, pages = {977-1003}, abstract = {Modeling nonstationary processes is of paramount importance to many scientific disciplines including environmental science, ecology, and finance, among others. Consequently, flexible methodology that provides accurate estimation across a wide range of processes is a subject of ongoing interest. We propose a novel approach to model-based time-frequency estimation using time-varying autoregressive models. In this context, we take a fully Bayesian approach and allow both the autoregressive coefficients and innovation variance to vary over time. Importantly, our estimation method uses the lattice filter and is cast within the partial autocorrelation domain. The marginal posterior distributions are of standard form and, as a convenient by-product of our estimation method, our approach avoids undesirable matrix inversions. As such, estimation is extremely computationally efficient and stable. To illustrate the effectiveness of our approach, we conduct a comprehensive simulation study that compares our method with other competing methods and find that, in most cases, our approach performs superior in terms of average squared error between the estimated and true time-varying spectral density. Lastly, we demonstrate our methodology through three modeling applications; namely, insect communication signals, environmental data (wind components), and macroeconomic data (US gross domestic product (GDP) and consumption).}, url = {https://arxiv.org/abs/1408.2757}, author = {Yang, W.H. and Holan, S.H. and Wikle, C.K.} } @techreport {2391, title = {Bayesian mixture modeling for multivariate conditional distributions}, number = {1606.04457}, year = {2016}, institution = {ArXiv}, abstract = {We present a Bayesian mixture model for estimating the joint distribution of mixed ordinal, nominal, and continuous data conditional on a set of fixed variables. The model uses multivariate normal and categorical mixture kernels for the random variables. It induces dependence between the random and fixed variables through the means of the multivariate normal mixture kernels and via a truncated local Dirichlet process. The latter encourages observations with similar values of the fixed variables to share mixture components. 
Using a simulation of data fusion, we illustrate that the model can estimate underlying relationships in the data and the distributions of the missing values more accurately than a mixture model applied to the random and fixed variables jointly. We use the model to analyze consumers{\textquoteright} reading behaviors using a quota sample, i.e., a sample where the empirical distribution of some variables is fixed by design and so should not be modeled as random, conducted by the book publisher HarperCollins.}, url = {http://arxiv.org/abs/1606.04457}, author = {Maria DeYoreo and Jerome P. Reiter} } @techreport {2390, title = {A Bayesian nonparametric Markovian model for nonstationary time series}, number = {1601.04331}, year = {2016}, institution = {ArXiv}, abstract = {Stationary time series models built from parametric distributions are, in general, limited in scope due to the assumptions imposed on the residual distribution and autoregression relationship. We present a modeling approach for univariate time series data, which makes no assumptions of stationarity, and can accommodate complex dynamics and capture nonstandard distributions. The model for the transition density arises from the conditional distribution implied by a Bayesian nonparametric mixture of bivariate normals. This implies a flexible autoregressive form for the conditional transition density, defining a time-homogeneous, nonstationary, Markovian model for real-valued data indexed in discrete-time. To obtain a more computationally tractable algorithm for posterior inference, we utilize a square-root-free Cholesky decomposition of the mixture kernel covariance matrix. Results from simulated data suggest the model is able to recover challenging transition and predictive densities. We also illustrate the model on time intervals between eruptions of the Old Faithful geyser. Extensions to accommodate higher order structure and to develop a state-space model are also discussed.}, url = {http://arxiv.org/abs/1601.04331}, author = {Maria DeYoreo and Athanasios Kottas} } @article {hahn:murray:mano, title = {A Bayesian Partial Identification Approach to Inferring the Prevalence of Accounting Misconduct}, journal = {Journal of the American Statistical Association}, volume = {111}, year = {2016}, pages = {14{\textendash}26}, abstract = {This article describes the use of flexible Bayesian regression models for estimating a partially identified probability function. Our approach permits efficient sensitivity analysis concerning the posterior impact of priors on the partially identified component of the regression model. The new methodology is illustrated on an important problem where only partially observed data are available{\textemdash}inferring the prevalence of accounting misconduct among publicly traded U.S. businesses. Supplementary materials for this article are available online.}, doi = {10.1080/01621459.2015.1084307}, url = {http://www.tandfonline.com/doi/full/10.1080/01621459.2015.1084307}, author = {P. R. Hahn and J. S. Murray and I. Manolopoulou} } @article {doi:10.1080/01621459.2016.1231612, title = {Bayesian Simultaneous Edit and Imputation for Multivariate Categorical Data}, journal = {Journal of the American Statistical Association}, year = {2016}, month = {09/2016}, abstract = {In categorical data, it is typically the case that some combinations of variables are theoretically impossible, such as a three year old child who is married or a man who is pregnant. 
In practice, however, reported values often include such structural zeros due to, for example, respondent mistakes or data processing errors. To purge data of such errors, many statistical organizations use a process known as edit-imputation. The basic idea is first to select reported values to change according to some heuristic or loss function, and second to replace those values with plausible imputations. This two-stage process typically does not fully utilize information in the data when determining locations of errors, nor does it appropriately reflect uncertainty resulting from the edits and imputations. We present an alternative approach to editing and imputation for categorical microdata with structural zeros that addresses these shortcomings. Specifically, we use a Bayesian hierarchical model that couples a stochastic model for the measurement error process with a Dirichlet process mixture of multinomial distributions for the underlying, error free values. The latter model is restricted to have support only on the set of theoretically possible combinations. We illustrate this integrated approach to editing and imputation using simulation studies with data from the 2000 U. S. census, and compare it to a two-stage edit-imputation routine. Supplementary material is available online.}, doi = {10.1080/01621459.2016.1231612}, url = {http://dx.doi.org/10.1080/01621459.2016.1231612}, author = {Daniel Manrique-Vallier and Jerome P. Reiter} } @article {2666, title = {Bayesian Spatial Change of Support for Count-Valued Survey Data with Application to the American Community Survey}, journal = {Journal of the American Statistical Association}, year = {2016}, pages = {472-487}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year "period-estimates," and corresponding margins of errors, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies it is often of interest to data users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on "new" spatial supports in "real-time." This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey-data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in "real-time." We demonstrate the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, url = {https://arxiv.org/abs/1405.7227}, author = {Bradley, J.R. and Wikle, C.K. 
and Holan, S.H.} } @article {fosdick:deyoreo:reiter, title = {Categorical data fusion using auxiliary information}, journal = {Annals of Applied Statistics}, volume = {10}, number = {4}, year = {2016}, pages = {1907 {\textendash} 1929}, abstract = {In data fusion analysts seek to combine information from two databases comprised of disjoint sets of individuals, in which some variables appear in both databases and other variables appear in only one database. Most data fusion techniques rely on variants of conditional independence assumptions. When inappropriate, these assumptions can result in unreliable inferences. We propose a data fusion technique that allows analysts to easily incorporate auxiliary information on the dependence structure of variables not observed jointly; we refer to this auxiliary information as glue. With this technique, we fuse two marketing surveys from the book publisher HarperCollins using glue from the online, rapid-response polling company CivicScience. The fused data enable estimation of associations between people{\textquoteright}s preferences for authors and for learning about new books. The analysis also serves as a case study on the potential for using online surveys to aid data fusion.}, keywords = {Imputation, Integration, Latent Class, Matching}, doi = {10.1214/16-AOAS925}, url = {http://projecteuclid.org/euclid.aoas/1483606845}, author = {B. K. Fosdick and M. De Yoreo and J. P. Reiter} } @article {2670, title = {Computation of the Autocovariances for Time Series with Multiple Long-Range Persistencies}, journal = {Computational Statistics and Data Analysis}, year = {2016}, pages = {44 - 56}, abstract = {Gegenbauer processes allow for flexible and convenient modeling of time series data with multiple spectral peaks, where the qualitative description of these peaks is via the concept of cyclical long-range dependence. The Gegenbauer class is extensive, including ARFIMA, seasonal ARFIMA, and GARMA processes as special cases. Model estimation is challenging for Gegenbauer processes when multiple zeros and poles occur in the spectral density, because the autocovariance function is laborious to compute. The method of splitting{\textendash}essentially computing autocovariances by convolving long memory and short memory dynamics{\textendash}is only tractable when a single long memory pole exists. An additive decomposition of the spectrum into a sum of spectra is proposed, where each summand has a single singularity, so that a computationally efficient splitting method can be applied to each term and then aggregated. This approach differs from handling all the poles in the spectral density at once, via an analysis of truncation error. The proposed technique allows for fast estimation of time series with multiple long-range dependences, which is illustrated numerically and through several case-studies.}, url = {http://www.sciencedirect.com/science/article/pii/S0167947316300202}, author = {McElroy, T.S. 
and Holan, S.H.} } @booklet {2532, title = {Data management and analytic use of paradata: SIPP-EHC audit trails}, year = {2016}, author = {Lee, Jinyoung and Seloske, Ben and C{\'o}rdova Cazar, Ana Luc{\'\i}a and Eck, Adam and Kirchner, Antje and Belli, Robert F.} } @article {2241, title = {Differentially private publication of data on wages and job mobility}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, month = {02/2016}, pages = {81-92}, chapter = {81}, abstract = {Brazil, like many countries, is reluctant to publish business-level data, because of legitimate concerns about the establishments{\textquoteright} confidentiality. A trusted data curator can increase the utility of data, while managing the risk to establishments, either by releasing synthetic data, or by infusing noise into published statistics. This paper evaluates the application of a differentially private mechanism to publish statistics on wages and job mobility computed from Brazilian employer-employee matched data. The publication mechanism can result in both the publication of specific statistics as well as the generation of synthetic data. I find that the tradeoff between the privacy guaranteed to individuals in the data, and the accuracy of published statistics, is potentially much better than the worst-case theoretical accuracy guarantee. However, the synthetic data fare quite poorly in analyses that are outside the set of queries to which they were trained. Note that this article only explores and characterizes the feasibility of these publication strategies, and will not directly result in the publication of any data. }, keywords = {Demand for public statistics, differential privacy, job mobility, matched employer-employee data, optimal confidentiality protection, optimal data accuracy, technology for statistical agencies}, doi = {10.3233/SJI-160962}, url = {http://content.iospress.com/articles/statistical-journal-of-the-iaos/sji962}, author = {Schmutte, Ian M.} } @techreport {handle:1813:52167, title = {Differentially Private Verification of Regression Model Results}, number = {1813:52167}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Differentially Private Verification of Regression Model Results Reiter, Jerry}, url = {http://hdl.handle.net/1813/52167}, author = {Reiter, Jerry} } @article {2509, title = {Do Interviewers with High Cooperation Rates Behave Differently? Interviewer Cooperation Rates and Interview Behaviors}, journal = {Survey Practice}, volume = {9}, year = {2016}, month = {2016}, pages = {no pp.}, abstract = {Interviewers are required to be flexible in responding to respondent concerns during recruitment, but standardized during administration of the questionnaire. These skill sets may be at odds. Recent research has shown a U-shaped relationship between interviewer cooperation rates and interviewer variance: the least and the most successful interviewers during recruitment have the largest interviewer variance components. Little is known about why this association occurs. 
We posit four hypotheses for this association: 1) interviewers with higher cooperation rates are more conscientious interviewers altogether, 2) interviewers with higher cooperation rates continue to use rapport behaviors from the cooperation request throughout an interview, 3) interviewers with higher cooperation rates display more confidence, which translates into different interview behavior, and 4) interviewers with higher cooperation rates continue their flexible interviewing style throughout the interview and deviate more from standardized interviewing. We use behavior codes from the Work and Leisure Today Survey (n=450, AAPOR RR3=6.3\%) to evaluate interviewer behavior. Our results largely support the confidence hypothesis. Interviewers with higher cooperation rates do not show evidence of being {\textquotedblleft}better{\textquotedblright} interviewers.}, url = {http://www.surveypractice.org/index.php/SurveyPractice/article/view/351}, author = {Olson, Kristen and Kirchner, Antje and Smyth, Jolene D.} } @techreport {2571, title = {Estimating Compensating Wage Differentials with Endogenous Job Mobility}, year = {2016}, abstract = {We demonstrate a strategy for using matched employer-employee data to correct endogenous job mobility bias when estimating compensating wage differentials. Applied to fatality rates in the census of formal-sector jobs in Brazil between 2003 and 2010, we show why common approaches to eliminating ability bias can greatly amplify endogenous job mobility bias. By extending the search-theoretic hedonic wage framework, we establish conditions necessary to interpret our estimates as preferences. We present empirical analyses supporting the predictions of the model and identifying conditions, demonstrating that the standard models are misspecified, and that our proposed model eliminates latent ability and endogenous mobility biases.}, url = {http://digitalcommons.ilr.cornell.edu/ldi/29/}, author = {Kurt Lavetti and Ian M. Schmutte} } @article {2667, title = {Generating Partially Synthetic Geocoded Public Use Data with Decreased Disclosure Risk Using Differential Smoothing}, journal = {Journal of the Royal Statistical Society - Series A}, year = {2016}, abstract = {When collecting geocoded confidential data with the intent to disseminate, agencies often resort to altering the geographies prior to making data publicly available due to data privacy obligations. An alternative to releasing aggregated and/or perturbed data is to release multiply-imputed synthetic data, where sensitive values are replaced with draws from statistical models designed to capture important distributional features in the collected data. One issue that has received relatively little attention, however, is how to handle spatially outlying observations in the collected data, as common spatial models often have a tendency to overfit these observations. The goal of this work is to bring this issue to the forefront and propose a solution, which we refer to as "differential smoothing." After implementing our method on simulated data, highlighting the effectiveness of our approach under various scenarios, we illustrate the framework using data consisting of sale prices of homes in San Francisco.}, url = {https://arxiv.org/abs/1507.05529}, author = {Quick, H. and Holan, S.H. 
and Wikle, C.K.} } @techreport {handle:1813:52610, title = {Hours Off the Clock}, number = {1813:52610}, year = {2016}, institution = {Cornell University}, type = {Preprint}, abstract = {Hours Off the Clock Green, Andrew To what extent do workers work more hours than they are paid for? The relationship between hours worked and hours paid, and the conditions under which employers can demand more hours {\textquotedblleft}off the clock,{\textquotedblright} is not well understood. The answer to this question impacts worker welfare, as well as wage and hour regulation. In addition, work off the clock has important implications for the measurement and cyclical movement of productivity and wages. In this paper, I construct a unique administrative dataset of hours paid by employers linked to a survey of workers on their reported hours worked to measure work off the clock. Using cross-sectional variation in local labor markets, I find only a small cyclical component to work off the clock. The results point to labor hoarding rather than efficiency wage theory, indicating work off the clock cannot explain the counter-cyclical movement of productivity. I find workers employed by small firms, and in industries with a high rate of wage and hour violations are associated with larger differences in hours worked than hours paid. These findings suggest the importance of tracking hours of work for enforcement of labor regulations.}, url = {http://hdl.handle.net/1813/52610}, author = {Green, Andrew} } @article {2415, title = {How Should We Define Low-Wage Work? An Analysis Using the Current Population Survey}, journal = {Monthly Labor Review}, year = {2016}, month = {October}, abstract = {Low-wage work is a central concept in considerable research, yet it lacks an agreed-upon definition. Using data from the Current Population Survey{\textquoteright}s Annual Social and Economic Supplement, the analysis presented in this article suggests that defining low-wage work on the basis of alternative hourly wage cutoffs changes the size of the low-wage population, but does not noticeably alter time trends in the rate of change. The analysis also indicates that different definitions capture groups of workers with substantively different demographic, social, and economic characteristics. Although the individuals in any of the categories examined might reasonably be considered low-wage workers, a single definition obscures these distinctions.}, url = {http://www.bls.gov/opub/mlr/2016/article/pdf/how-should-we-define-low-wage-work.pdf}, author = {Fusaro, V. and Shaefer, H. Luke} } @techreport {handle:1813:44663, title = {How Will Statistical Agencies Operate When All Data Are Private?}, number = {1813:44663}, year = {2016}, institution = {Cornell University}, type = {Preprint}, abstract = {How Will Statistical Agencies Operate When All Data Are Private? Abowd, John M. The dual problems of respecting citizen privacy and protecting the confidentiality of their data have become hopelessly conflated in the {\textquotedblleft}Big Data{\textquotedblright} era. There are orders of magnitude more data outside an agency{\textquoteright}s firewall than inside it{\textemdash}compromising the integrity of traditional statistical disclosure limitation methods. And increasingly the information processed by the agency was {\textquotedblleft}asked{\textquotedblright} in a context wholly outside the agency{\textquoteright}s operations{\textemdash}blurring the distinction between what was asked and what is published. 
Already, private businesses like Microsoft, Google and Apple recognize that cybersecurity (safeguarding the integrity and access controls for internal data) and privacy protection (ensuring that what is published does not reveal too much about any person or business) are two sides of the same coin. This is a paradigm-shifting moment for statistical agencies.}, url = {http://hdl.handle.net/1813/44663}, author = {Abowd, John M.} } @article {2020, title = {Incorporating marginal prior information into latent class models}, journal = {Bayesian Analysis}, volume = {11}, year = {2016}, pages = {499-518}, doi = {doi:10.1214/15-BA959}, url = {https://projecteuclid.org/euclid.ba/1434649584}, author = {Schifeling, T. S. and Reiter, J. P.} } @article {2235, title = {Measuring Poverty Using the Supplemental Poverty Measure in the Panel Study of Income Dynamics, 1998 to 2010}, journal = {Journal of Economic and Social Measurement}, volume = {41}, year = {2016}, chapter = {17}, abstract = {The Supplemental Poverty Measure (SPM) was recently introduced by the U.S. Census Bureau as an alternative measure of poverty that addresses many shortcomings of the official poverty measure (OPM) to better reflect the resources households have available to meet their basic needs. The Census SPM is available only in the Current Population Survey (CPS). This paper describes a method for constructing SPM poverty estimates in the Panel Study of Income Dynamics (PSID), for the biennial years 1998 through 2010. A public-use dataset of individual-level SPM status produced in this analysis will be available for download on the PSID website. Annual SPM poverty estimates from the PSID are presented for the years 1998, 2000, 2002, 2004, 2006, 2008, and 2010 and compared to SPM estimates for the same years derived from CPS data by the Census Bureau and independent researchers. We find that SPM poverty rates in the PSID are somewhat lower than those found in the CPS, though trends over time and impact of specific SPM components are similar across the two datasets.}, doi = {10.3233/JEM-160425}, url = {http://content.iospress.com/articles/journal-of-economic-and-social-measurement/jem425}, author = {Kimberlin, S. and Shaefer, H.L. and Kim, J.} } @booklet {2515, title = {Mismatches}, year = {2016}, author = {Smyth, Jolene and Olson, Kristen} } @techreport {handle:1813:40306, title = {Modeling Endogenous Mobility in Earnings Determination}, number = {1813:40306}, year = {2016}, institution = {Cornell University}, type = {Preprint}, abstract = {Modeling Endogenous Mobility in Earnings Determination Abowd, John M.; McKinney, Kevin L.; Schmutte, Ian M. We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax the exogenous mobility assumptions by modeling the evolution of the matched data as an evolving bipartite graph using a Bayesian latent class framework. Our results suggest that endogenous mobility biases estimated firm effects toward zero. To assess validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. 
The corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates. Replication code can be found at DOI: http://doi.org/10.5281/zenodo.zenodo.376600 and our Github repository endogenous-mobility-replication .}, url = {http://hdl.handle.net/1813/40306}, author = {Abowd, John M. and McKinney, Kevin L. and Schmutte, Ian M.} } @article {doi:10.1080/01621459.2016.1174132, title = {Multiple Imputation of Missing Categorical and Continuous Values via Bayesian Mixture Models with Local Dependence}, journal = {Journal of the American Statistical Association}, year = {2016}, abstract = {We present a nonparametric Bayesian joint model for multivariate continuous and categorical variables, with the intention of developing a flexible engine for multiple imputation of missing values. The model fuses Dirichlet process mixtures of multinomial distributions for categorical variables with Dirichlet process mixtures of multivariate normal distributions for continuous variables. We incorporate dependence between the continuous and categorical variables by (i) modeling the means of the normal distributions as component-specific functions of the categorical variables and (ii) forming distinct mixture components for the categorical and continuous data with probabilities that are linked via a hierarchical model. This structure allows the model to capture complex dependencies between the categorical and continuous data with minimal tuning by the analyst. We apply the model to impute missing values due to item nonresponse in an evaluation of the redesign of the Survey of Income and Program Participation (SIPP). The goal is to compare estimates from a field test with the new design to estimates from selected individuals from a panel collected under the old design. We show that accounting for the missing data changes some conclusions about the comparability of the distributions in the two datasets. We also perform an extensive repeated sampling simulation using similar data from complete cases in an existing SIPP panel, comparing our proposed model to a default application of multiple imputation by chained equations. Imputations based on the proposed model tend to have better repeated sampling properties than the default application of chained equations in this realistic setting.}, doi = {10.1080/01621459.2016.1174132}, url = {http://dx.doi.org/10.1080/01621459.2016.1174132}, author = {Jared S. Murray and Jerome P. Reiter} } @article {2669, title = {Multivariate Spatio-Temporal Survey Fusion with Application to the American Community Survey and Local Area Unemployment Statistics}, journal = {Stat}, year = {2016}, pages = {224 - 233}, abstract = {There are often multiple surveys available that estimate and report related demographic variables of interest that are referenced over space and/or time. Not all surveys produce the same information, and thus, combining these surveys typically leads to higher quality estimates. That is, not every survey has the same level of precision nor do they always provide estimates of the same variables. In addition, various surveys often produce estimates with incomplete spatio-temporal coverage. By combining surveys using a Bayesian approach, we can account for different margins of error and leverage dependencies to produce estimates of every variable considered at every spatial location and every time point. 
Specifically, our strategy is to use a hierarchical modelling approach, where the first stage of the model incorporates the margin of error associated with each survey. Then, in a lower stage of the hierarchical model, the multivariate spatio-temporal mixed effects model is used to incorporate multivariate spatio-temporal dependencies of the processes of interest. We adopt a fully Bayesian approach for combining surveys; that is, given all of the available surveys, the conditional distributions of the latent processes of interest are used for statistical inference. To demonstrate our proposed methodology, we jointly analyze period estimates from the US Census Bureau{\textquoteright}s American Community Survey, and estimates obtained from the Bureau of Labor Statistics Local Area Unemployment Statistics program. Copyright {\textcopyright} 2016 John Wiley \& Sons, Ltd.}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.120/full}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K} } @techreport {handle:1813:45885, title = {NCRN Meeting Fall 2016}, number = {1813:45885}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2016 Vilhuber, Lars Taken place at the U.S. Census Bureau HQ, Washington DC.}, url = {http://hdl.handle.net/1813/45885}, author = {Vilhuber, Lars} } @techreport {handle:1813:45823, title = {NCRN Meeting Fall 2016: Audit Trails, Parallel Navigation, and the SIPP}, number = {1813:45823}, year = {2016}, institution = {University of Nebraska}, type = {Preprint}, abstract = {NCRN Meeting Fall 2016: Audit Trails, Parallel Navigation, and the SIPP Lee, Jinyoung Thanks to Dr. Robert Belli, Ana Luc{\'\i}a C{\'o}rdova Cazar, and Ben Seloske for the team effort.}, url = {http://hdl.handle.net/1813/45823}, author = {Lee, Jinyoung} } @techreport {handle:1813:45821, title = {NCRN Meeting Fall 2016: Scanner Data and Economic Statistics: A Unified Approach}, number = {1813:45821}, year = {2016}, institution = {University of Michigan}, type = {Preprint}, abstract = {NCRN Meeting Fall 2016: Scanner Data and Economic Statistics: A Unified Approach Redding, Stephen J.; Weinstein, David E.}, url = {http://hdl.handle.net/1813/45821}, author = {Redding, Stephen J. and Weinstein, David E.} } @techreport {handle:1813:45899, title = {NCRN Meeting Spring 2016}, number = {1813:45899}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016 Vilhuber, Lars Taken place at U.S. Census Bureau HQ, Washington DC.}, url = {http://hdl.handle.net/1813/45899}, author = {Vilhuber, Lars} } @techreport {handle:1813:43897, title = {NCRN Meeting Spring 2016: A 2016 View of 2020 Census Quality, Costs, Benefits}, number = {1813:43897}, year = {2016}, institution = {Northwestern University}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: A 2016 View of 2020 Census Quality, Costs, Benefits Spencer, Bruce D. Census costs affect data quality and data quality affects census benefits. Although measuring census data quality is difficult enough ex post, census planning requires it to be done well in advance. The topic of this talk is the prediction of the cost-quality curve, its uncertainty, and its relation to benefits from census data. 
Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43897}, author = {Spencer, Bruce D.} } @techreport {handle:1813:43889, title = {NCRN Meeting Spring 2016: Attitudes Towards Geolocation-Enabled Census Forms}, number = {1813:43889}, year = {2016}, institution = {Carnegie-Mellon University}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: Attitudes Towards Geolocation-Enabled Census Forms Brandimarte, Laura; Chiew, Ernest; Ventura, Sam; Acquisti, Alessandro Geolocation refers to the automatic identification of the physical locations of Internet users. In an online survey experiment, we studied respondent reactions towards different types of geolocation. After coordinating with US Census Bureau researchers, we designed and administered a replica of a census form to a sample of respondents. We also created slightly different forms by manipulating the type of geolocation implemented. Using the IP address of each respondent, we approximated the geographical coordinates of the respondent and displayed this location on a map on the survey. Across different experimental conditions, we manipulated the map interface between the three interfaces on the Google Maps API: default road map, Satellite View, and Street View. We also provided either a specific, pinpointed location, or a set of two circles of 1- and 2-miles radius. Snapshots of responses were captured at every instant information was added, altered, or deleted by respondents when completing the survey. We measured willingness to provide information on the typical Census form, as well as privacy concerns associated with geolocation technologies and attitudes towards the use of online geographical maps to identify one{\textquoteright}s exact current location. Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43889}, author = {Brandimarte, Laura and Chiew, Ernest and Ventura, Sam and Acquisti, Alessandro} } @techreport {handle:1813:43895, title = {NCRN Meeting Spring 2016: Developing job linkages for the Health and Retirement Study}, number = {1813:43895}, year = {2016}, institution = {University of Michigan}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: Developing job linkages for the Health and Retirement Study McCue, Kristin; Abowd, John; Levenstein, Margaret; Patki, Dhiren; Rodgers, Ann; Shapiro, Matthew; Wasi, Nada This paper documents work using probabilistic record linkage to create a crosswalk between jobs reported in the Health and Retirement Study (HRS) and the list of workplaces on Census Bureau{\textquoteright}s Business Register. Matching job records provides an opportunity to join variables that occur uniquely in separate datasets, to validate responses, and to develop missing data imputation models. Identifying the respondent{\textquoteright}s workplace ({\textquotedblleft}establishment{\textquotedblright}) is valuable for HRS because it allows researchers to incorporate the effects of particular social, economic, and geospatial work environments in studies of respondent health and retirement behavior. The linkage makes use of name and address standardizing techniques tailored to business data that were recently developed in a collaboration between researchers at Census, Cornell, and the University of Michigan. 
The matching protocol makes no use of the identity of the HRS respondent and strictly protects the confidentiality of information about the respondent{\textquoteright}s employer. The paper first describes the clerical review process used to create a set of human-reviewed candidate pairs, and use of that set to train matching models. It then describes and compares several linking strategies that make use of employer name, address, and phone number. Finally it discusses alternative ways of incorporating information on match uncertainty into estimates based on the linked data, and illustrates their use with a preliminary sample of matched HRS jobs. Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43895}, author = {Mccue, Kristin and Abowd, John and Levenstein, Margaret and Patki, Dhiren and Rodgers, Ann and Shapiro, Matthew and Wasi, Nada} } @techreport {handle:1813:43896, title = {NCRN Meeting Spring 2016: Evaluating Data quality in Time Diary Surveys Using Paradata}, number = {1813:43896}, year = {2016}, institution = {University of Nebraska}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: Evaluating Data quality in Time Diary Surveys Using Paradata C{\'o}rdova Cazar, Ana Luc{\'\i}a; Belli, Robert Over the past decades, time use researchers have been increasingly interested in analyzing wellbeing in tandem with the use of time (Juster and Stafford, 1985; Krueger et al., 2009). Many methodological issues have arisen in this endeavor, including the concern about the quality of the time use data. Survey researchers have increasingly turned to the analysis of paradata to better understand and model data quality. In particular, it has been argued that paradata may serve as a proxy for the respondents{\textquoteright} cognitive response process, and can be used as an additional tool to assess the impact of data generation on data quality. In this presentation, data quality in the American Time Use Survey (ATUS) will be assessed through the use of paradata and survey responses. Specifically, I will talk about a data quality index I have created, which includes measures of different types of ATUS errors (e.g. low number of reported activities, failures to report an activity), and paradata variables (e.g. response latencies, incompletes). The overall objective of this study is to contribute to data quality assessment in the collection of timeline data from national surveys by providing insights on those interviewing dynamics that most impact data quality. These insights will help to improve future instruments and training of interviewers, as well as to reduce costs. Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43896}, author = {C{\'o}rdova Cazar, Ana Luc{\'\i}a and Belli, Robert} } @techreport {handle:1813:43893, title = {NCRN Meeting Spring 2016: The ATUS and SIPP-EHC: Recent Developments}, number = {1813:43893}, year = {2016}, institution = {University of Nebraska}, type = {Preprint}, abstract = {NCRN Meeting Spring 2016: The ATUS and SIPP-EHC: Recent Developments Belli, Robert F.
One of the main objectives of the NCRN award to the University of Nebraska node is to investigate data quality associated with timeline interviewing as conducted with the American Time Use Survey (ATUS) time diary and the Survey of Income and Program Participation event history calendar (SIPP-EHC). Specifically, our efforts are focused on the relationships between interviewing dynamics as extracted from analyses of paradata with measures of data quality. With the ATUS, our recent efforts have revealed that respondents differ in how they handle difficulty with remembering activities, with some overcoming these difficulties and others succumbing to them. With the SIPP-EHC, we are still in the initial stages of extracting variables from the paradata that are associated with interviewing dynamics. Our work has also involved the development of a CATI time diary in which we are able to analyze audio streams to capture interviewing dynamics. I will conclude this talk by discussing challenges that have yet to be overcome with our work, and our vision of moving forward with the eventual development of self-administered timeline instruments that will be respondent-friendly due to the assistance of intelligent-agent driven virtual interviewers. Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43893}, author = {Belli, Robert F.} } @techreport {handle:1813:52165, title = {NCRN Meeting Spring 2017: 2017 Economic Census: Towards Synthetic Data Sets}, number = {1813:52165}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017: 2017 Economic Census: Towards Synthetic Data Sets Caldwell, Carol; Thompson, Katherine Jenny}, url = {http://hdl.handle.net/1813/52165}, author = {Caldwell, Carol and Thompson, Katherine Jenny} } @techreport {handle:1813:52167, title = {NCRN Meeting Spring 2017: Differentially Private Verification of Regression Model Results}, number = {1813:52167}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017: Differentially Private Verification of Regression Model Results Reiter, Jerry}, url = {http://hdl.handle.net/1813/52167}, author = {Reiter, Jerry} } @techreport {handle:1813:52166, title = {NCRN Meeting Spring 2017: Practical Issues in Anonymity}, number = {1813:52166}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2017: Practical Issues in Anonymity Clifton, Chris; Merill, Shawn; Merill, Keith}, url = {http://hdl.handle.net/1813/52166}, author = {Clifton, Chris and Merill, Shawn and Merill, Keith} } @techreport {handle:1813:42394, title = {NCRN Newsletter: Volume 2 - Issue 4}, number = {1813:42394}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {
NCRN Newsletter: Volume 2 - Issue 4 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from September 2015 through December 2015. NCRN Newsletter Vol. 2, Issue 4: January 28, 2016.
}, url = {http://hdl.handle.net/1813/42394}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:44199, title = {NCRN Newsletter: Volume 3 - Issue 1}, number = {1813:44199}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 3 - Issue 1 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from January 2016 through May 2016. NCRN Newsletter Vol. 3, Issue 1: June 10, 2016}, url = {http://hdl.handle.net/1813/44199}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:46171, title = {NCRN Newsletter: Volume 3 - Issue 2}, number = {1813:46171}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 3 - Issue 2 Vilhuber, Lars; Knight-Ingram, Dory Overview of activities at NSF-Census Research Network nodes from June 2016 through December 2016. NCRN Newsletter Vol. 3, Issue 2: December 23, 2016}, url = {http://hdl.handle.net/1813/46171}, author = {Vilhuber, Lars and Knight-Ingram, Dory} } @article {2240, title = {Noise infusion as a confidentiality protection measure for graph-based statistics}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, pages = {127-135}, chapter = {127}, abstract = {We use the bipartite graph representation of longitudinally linked employer-employee data, and the associated projections onto the employer and employee nodes, respectively, to characterize the set of potential statistical summaries that the trusted custodian might produce. We consider noise infusion as the primary confidentiality protection method. We show that a relatively straightforward extension of the dynamic noise-infusion method used in the U.S. Census Bureau{\textquoteright}s Quarterly Workforce Indicators can be adapted to provide the same confidentiality guarantees for the graph-based statistics: all inputs have been modified by a minimum percentage deviation (i.e., no actual respondent data are used) and, as the number of entities contributing to a particular statistic increases, the accuracy of that statistic approaches the unprotected value. Our method also ensures that the protected statistics will be identical in all releases based on the same inputs.}, doi = {10.3233/SJI-160958}, url = {http://content.iospress.com/articles/statistical-journal-of-the-iaos/sji958}, author = {Abowd, John M. 
and McKinney, Kevin L.} } @techreport {handle:1813:46210, title = {The NSF-Census Research Network in 2016: Taking stock, looking forward}, number = {1813:46210}, year = {2016}, institution = {University of Missouri}, type = {Preprint}, abstract = {The NSF-Census Research Network in 2016: Taking stock, looking forward Vilhuber, Lars An overview of the activities of the NSF-Census Research Network as of 2016, given on Saturday, May 21, 2016, at a workshop on spatial and spatio-temporal design and analysis for official statistics, hosted by the Spatio-Temporal Statistics NSF Census Research Network (STSN) at the University of Missouri, and sponsored by the NSF-Census Research Network (NCRN)}, url = {http://hdl.handle.net/1813/46210}, author = {Vilhuber, Lars} } @article {2271, title = {Parallel associations and the structure of autobiographical knowledge}, journal = {Journal of Applied Research in Memory and Cognition}, volume = {5}, year = {2016}, month = {03/2016}, pages = {150{\textendash}157}, abstract = {The self-memory system (SMS) model of autobiographical knowledge conceives that memories are structured thematically, organized both hierarchically and temporally. This model has been challenged on several fronts, including the absence of parallel linkages across pathways. Calendar survey interviewing shows the frequent and varied use of parallel associations in autobiographical recall. Parallel associations in these data are commonplace, and are driven more by respondents{\textquoteright} generative retrieval than by interviewers{\textquoteright} probing. Parallel associations represent a number of autobiographical knowledge themes that are interrelated across life domains. The content of parallel associations is nearly evenly split between general and transitional events, supporting the importance of transitions in autobiographical memory. Associations in respondents{\textquoteright} memories (both parallel and sequential) demonstrate complex interactions with interviewer verbal behaviors during generative retrieval. In addition to discussing the implications of these results for the SMS model, implications are also drawn for transition theory and the basic-systems model.}, keywords = {Autobiographical memory; Autobiographical knowledge; Autobiographical periods; Episodic memory; Retrospective reports}, doi = {10.1016/j.jarmac.2016.03.004}, author = {Belli, R.F. and T. Al Baghal} } @techreport {handle:1813:52166, title = {Practical Issues in Anonymity}, number = {1813:52166}, year = {2016}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Practical Issues in Anonymity Clifton, Chris; Merill, Shawn; Merill, Keith}, url = {http://hdl.handle.net/1813/52166}, author = {Clifton, Chris and Merill, Shawn and Merill, Keith} } @article {2399, title = {Probabilistic Record Linkage and Deduplication after Indexing, Blocking, and Filtering}, journal = {Journal of Privacy and Confidentiality}, volume = {7}, year = {2016}, abstract = {Probabilistic record linkage, the task of merging two or more databases in the absence of a unique identifier, is a perennial and challenging problem. It is closely related to the problem of deduplicating a single database, which can be cast as linking a single database against itself. In both cases the number of possible links grows rapidly in the size of the databases under consideration, and in most applications it is necessary to first reduce the number of record pairs that will be compared.
Spurred by practical considerations, a range of methods have been developed for this task. These methods go under a variety of names, including indexing and blocking, and have seen significant development. However, methods for inferring linkage structure that account for indexing, blocking, and additional filtering steps have not seen commensurate development. In this paper we review the implications of indexing, blocking and filtering within the popular Fellegi-Sunter framework, and propose a new model to account for particular forms of indexing and filtering.}, url = {http://repository.cmu.edu/jpc/vol7/iss1/2}, author = {Murray, J. S.} } @techreport {2016arXiv160806309D, title = {{Regression Modeling and File Matching Using Possibly Erroneous Matching Variables}}, number = {1608.06309}, year = {2016}, institution = {ArXiv}, abstract = {Many analyses require linking records from two databases comprising overlapping sets of individuals. In the absence of unique identifiers, the linkage procedure often involves matching on a set of categorical variables, such as demographics, common to both files. Typically, however, the resulting matches are inexact: some cross-classifications of the matching variables do not generate unique links across files. Further, the matching variables can be subject to reporting errors, which introduce additional uncertainty in analyses. We present a Bayesian file matching methodology designed to estimate regression models and match records simultaneously when categorical matching variables are subject to reporting error. The method relies on a hierarchical model that includes (1) the regression of interest involving variables from the two files given a vector indicating the links, (2) a model for the linking vector given the true values of the matching variables, (3) a measurement error model for reported values of the matching variables given their true values, and (4) a model for the true values of the matching variables. We describe algorithms for sampling from the posterior distribution of the model. We illustrate the methodology using artificial data and data from education records in the state of North Carolina.}, keywords = {Statistics - Applications}, url = {http://arxiv.org/abs/1608.06309}, author = {Dalzell, N.~M. and Reiter, J.~P.} } @article {2243, title = {Releasing synthetic magnitude micro data constrained to fixed marginal totals}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, month = {02/2016}, pages = {93-108}, chapter = {93}, abstract = {We present approaches to generating synthetic microdata for multivariate data that take on non-negative integer values, such as magnitude data in economic surveys. The basic idea is to estimate a mixture of Poisson distributions to describe the multivariate distribution, and release draws from the posterior predictive distribution of the model. We develop approaches that guarantee the synthetic data sum to marginal totals computed from the original data, as well as approaches that do not enforce this equality. For both cases, we present methods for assessing disclosure risks inherent in releasing synthetic magnitude microdata.
We illustrate the methodology using economic data from a survey of manufacturing establishments.}, keywords = {Confidential, Disclosure, establishment, mixture, poisson, risk}, doi = {10.3233/SJI-160959}, url = {http://content.iospress.com/download/statistical-journal-of-the-iaos/sji959}, author = {Wei, Lan and Reiter, Jerome P.} } @article {kim:reiter:karr16, title = {Simultaneous edit-imputation and disclosure limitation for business establishment data}, journal = {Journal of Applied Statistics}, year = {2016}, month = {12/2016}, abstract = {Business establishment microdata typically are required to satisfy agency-specified edit rules, such as balance equations and linear inequalities. Inevitably some establishments{\textquoteright} reported data violate the edit rules. Statistical agencies correct faulty values using a process known as edit-imputation. Business establishment data also must be heavily redacted before being shared with the public; indeed, confidentiality concerns lead many agencies not to share establishment microdata as unrestricted access files. When microdata must be heavily redacted, one approach is to create synthetic data, as done in the U.S. Longitudinal Business Database and the German IAB Establishment Panel. This article presents the first implementation of a fully integrated approach to edit-imputation and data synthesis. We illustrate the approach on data from the U.S. Census of Manufactures and present a variety of evaluations of the utility of the synthetic data. The paper also presents assessments of disclosure risks for several intruder attacks. We find that the synthetic data preserve important distributional features from the post-editing confidential microdata, and have low risks for the various attacks.}, doi = {10.1080/02664763.2016.1267123}, author = {H. J. Kim and J. P. Reiter and A. F. Karr} } @article {folch2016demography, title = {Spatial Variation in the Quality of {A}merican {C}ommunity {S}urvey Estimates}, journal = {Demography}, volume = {53}, number = {5}, year = {2016}, pages = {1535{\textendash}1554}, author = {Folch, David C. and Arribas-Bel, Daniel and Koschinsky, Julia and Spielman, Seth E.} } @article {2239, title = {Synthetic establishment microdata around the world}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, pages = {65-68}, chapter = {65}, abstract = {In contrast to the many public-use microdata samples available for individual and household data from many statistical agencies around the world, there are virtually no establishment or firm microdata available. In large part, this difficulty in providing access to business microdata is due to the skewed and sparse distributions that characterize business data. Synthetic data are simulated data generated from statistical models. We organized sessions at the 2015 World Statistical Congress and the 2015 Joint Statistical Meetings, highlighting work on synthetic \emph{establishment} microdata. This overview situates those papers, published in this issue, within the broader literature.}, keywords = {Business data, confidentiality, differential privacy, international comparison, Multiple imputation, synthetic}, doi = {10.3233/SJI-160964}, url = {http://content.iospress.com/download/statistical-journal-of-the-iaos/sji964}, author = {Vilhuber, Lars and Abowd, John M. 
and Reiter, Jerome P.} } @mastersthesis {2484, title = {Topics on Official Statistics and Statistical Policy}, volume = {PHD}, year = {2016}, month = {09/2016}, pages = {24}, school = {Northwestern University}, address = {Evanston, Illinois }, abstract = {My dissertation studies decision questions for government statistical agencies, both regarding data collection and how to combine data from multiple sources. Informed decisions regarding expenditure on data collection require information about the effects of data quality on data use. For the first topic, I study two important uses of decennial census data in the U.S.: for apportioning the House of Representatives and for allocating federal funds. Estimates of distortions in these two uses are developed for different levels of census accuracy. Then, I thoroughly investigate the sensitivity of findings to the census error distribution and to the choice of how to measure the distortions. The chapter concludes with a proposed framework for partial cost-benefit analysis that charges a share of the cost of the census to allocation programs. Then, I investigate an approximation to make analysis of the effects of census error on allocations feasible when allocations also depend on non-census statistics, as is the case for many formula-based allocations. The approximation conditions on the realized values of the non-census statistics instead of using the joint distribution over both census and non-census statistics. The research studies how using the approximation affects conclusions. I find that in some simple cases, the approximation always either overstates or equals the true effects of census error. Understatement is possible in other cases, but theory suggests that the largest possible understatements are about one-third the amount of the largest possible overstatements. In simulations with a more complex allocation formula, the approximation tends to overstate the effects of census error with the overstatement increasing with error in non-census statistics but decreasing with error in census statistics. In the final chapter, I evaluate the use of 2008-2010 property tax data from CoreLogic, Inc. (CoreLogic), aggregated from county and township governments from around the country, to improve 2010 American Community Survey (ACS) estimates of property tax amounts for single-family homes. Particularly, I evaluate the potential to use CoreLogic to reduce respondent burden, to study survey response error and to improve adjustments for survey nonresponse. The coverage of the CoreLogic data varies between counties as does the correspondence between ACS and CoreLogic property taxes. This geographic variation implies that different approaches toward using CoreLogic are needed in different areas of the country. Further, large differences between CoreLogic and ACS property taxes in certain counties seem to be due to conceptual differences between what is collected in the two data sources. I examine three counties, Clark County, NV, Philadelphia County, PA and St. Louis County, MO, and compare how estimates would change with different approaches using the CoreLogic data. Mean county property tax estimates are highly sensitive to whether ACS or CoreLogic data are used to construct estimates. 
Using CoreLogic data in imputation modeling for nonresponse adjustment of ACS estimates modestly improves the predictive power of imputation models, although estimates of county property taxes and property taxes by mortgage status are not very sensitive to the imputation method.}, url = {http://search.proquest.com/docview/1826016819}, author = {Zachary Seeskin} } @article {2500, title = {Using Data Mining to Predict the Occurrence of Respondent Retrieval Strategies in Calendar Interviewing: The Quality of Retrospective Reports}, journal = {Journal of Official Statistics}, volume = {32}, year = {2016}, month = {2016}, pages = {579-600}, abstract = {Determining which verbal behaviors of interviewers and respondents are dependent on one another is a complex problem that can be facilitated via data-mining approaches. Data are derived from the interviews of 153 respondents of the Panel Study of Income Dynamics (PSID) who were interviewed about their life-course histories. Behavioral sequences of interviewer-respondent interactions that were most predictive of respondents spontaneously using parallel, timing, duration, and sequential retrieval strategies in their generation of answers were examined. We also examined which behavioral sequences were predictive of retrospective reporting data quality as shown by correspondence between calendar responses with responses collected in prior waves of the PSID. The verbal behaviors of immediately preceding interviewer and respondent turns of speech were assessed in terms of their co-occurrence with each respondent retrieval strategy. Interviewers{\textquoteright} use of parallel probes is associated with poorer data quality, whereas interviewers{\textquoteright} use of timing and duration probes, especially in tandem, is associated with better data quality. Respondents{\textquoteright} use of timing and duration strategies is also associated with better data quality and both strategies are facilitated by interviewer timing probes. Data mining alongside regression techniques is valuable to examine which interviewer-respondent interactions will benefit data quality. }, doi = {https://doi.org/10.1515/jos-2016-0030}, author = {Belli, Robert F. and Miller, L. Dee and Baghal, Tarek Al and Soh, Leen-Kiat} } @article {2238, title = {Using partially synthetic microdata to protect sensitive cells in business statistics}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, month = {2016}, pages = {69-80}, chapter = {69}, abstract = {We describe and analyze a method that blends records from both observed and synthetic microdata into public-use tabulations on establishment statistics. The resulting tables use synthetic data only in potentially sensitive cells. 
We describe different algorithms, and present preliminary results when applied to the Census Bureau{\textquoteright}s Business Dynamics Statistics and Synthetic Longitudinal Business Database, highlighting accuracy and protection afforded by the method when compared to existing public-use tabulations (with suppressions).}, keywords = {confidentiality protection, gross job flows, local labor markets, Statistical Disclosure Limitation, Synthetic data, time-series}, doi = {10.3233/SJI-160963}, url = {http://content.iospress.com/download/statistical-journal-of-the-iaos/sji963}, author = {Miranda, Javier and Vilhuber, Lars} } @techreport {2570, title = {Why Statistical Agencies Need to Take Privacy-loss Budgets Seriously, and What It Means When They Do}, year = {2016}, url = {http://digitalcommons.ilr.cornell.edu/ldi/32/}, author = {John M. Abowd} } @article {1866, title = {Accounting for nonignorable unit nonresponse and attrition in panel studies with refreshment samples}, journal = {Journal of Survey Statistics and Methodology}, volume = {3}, year = {2015}, pages = {265-295}, chapter = {265}, abstract = { Panel surveys typically suffer from attrition, which can lead to biased inference when basing analysis only on cases that complete all waves of the panel. Unfortunately, panel data alone cannot inform the extent of the bias from the attrition, so that analysts using the panel data alone must make strong and untestable assumptions about the missing data mechanism. Many panel studies also include refreshment samples, which are data collected from a random sample of new individuals during some later wave of the panel. Refreshment samples offer information that can be utilized to correct for biases induced by nonignorable attrition while reducing reliance on strong assumptions about the attrition process. To date, these bias correction methods have not dealt with two key practical issues in panel studies: unit nonresponse in the initial wave of the panel and in the refreshment sample itself. As we illustrate, nonignorable unit nonresponse can significantly compromise the analyst{\textquoteright}s ability to use the refreshment samples for attrition bias correction. Thus, it is crucial for analysts to assess how sensitive their inferences{\textemdash}corrected for panel attrition{\textemdash}are to different assumptions about the nature of the unit nonresponse. We present an approach that facilitates such sensitivity analyses, both for suspected nonignorable unit nonresponse in the initial wave and in the refreshment sample. We illustrate the approach using simulation studies and an analysis of data from the 2007-2008 Associated Press/Yahoo News election panel study. }, doi = {10.1093/jssam/smv007}, url = {http://jssam.oxfordjournals.org/content/3/3/265.abstract}, author = {Schifeling, T. and Cheng, C. and Hillygus, D. S. and Reiter, J. P.} } @article {1739, title = {Bayesian Analysis of Spatially-Dependent Functional Responses with Spatially-Dependent Multi-Dimensional Functional Predictors}, journal = {Statistica Sinica}, volume = {25}, year = {2015}, chapter = {205-223}, doi = {10.5705/ss.2013.245w}, url = {http://www3.stat.sinica.edu.tw/preprint/SS-13-245w_Preprint.pdf}, author = {Yang, W. H. and Wikle, C.K. and Holan, S.H. and Sudduth, K.
and Meyers, D.B.} } @article {1741, title = {Bayesian Binomial Mixture Models for Estimating Abundance in Ecological Monitoring Studies}, journal = {Annals of Applied Statistics}, volume = {9}, year = {2015}, pages = {1-26}, doi = {10.1214/14-AOAS801}, url = {http://projecteuclid.org/euclid.aoas/1430226082}, author = {Wu, G. and Holan, S.H. and Nilon, C.H. and Wikle, C.K.} } @article {Zhuangtoappear, title = {Bayesian Hierarchical Statistical SIRS Models}, journal = {Statistical Methods and Applications}, volume = {23}, year = {2015}, pages = {601-646}, doi = {10.1007/s10260-014-0280-9}, author = {Zhuang, L. and Cressie, N.} } @article {2126, title = {Bayesian Latent Pattern Mixture Models for Handling Attrition in Panel Studies With Refreshment Samples}, journal = {ArXiv}, year = {2015}, month = {09/2015}, abstract = {Many panel studies collect refreshment samples---new, randomly sampled respondents who complete the questionnaire at the same time as a subsequent wave of the panel. With appropriate modeling, these samples can be leveraged to correct inferences for biases caused by non-ignorable attrition. We present such a model when the panel includes many categorical survey variables. The model relies on a Bayesian latent pattern mixture model, in which an indicator for attrition and the survey variables are modeled jointly via a latent class model. We allow the multinomial probabilities within classes to depend on the attrition indicator, which offers additional flexibility over standard applications of latent class models. We present results of simulation studies that illustrate the benefits of this flexibility. We apply the model to correct attrition bias in an analysis of data from the 2007-2008 Associated Press/Yahoo News election panel study. }, keywords = {Categorical, Dirichlet pro- cess, Multiple imputation, Non-ignorable, Panel attrition, Refreshment sample}, url = {http://arxiv.org/abs/1509.02124}, author = {Yajuan Si and Jerome P. Reiter and D. Sunshine Hillygus} } @article {2015arXiv:1408.2757, title = {Bayesian Lattice Filters for Time-Varying Autoregression and Time-Frequency Analysis}, journal = {ArXiv}, number = {TEST 2}, year = {2015}, abstract = {Modeling nonstationary processes is of paramount importance to many scientific disciplines including environmental science, ecology, and finance, among others. Consequently, flexible methodology that provides accurate estimation across a wide range of processes is a subject of ongoing interest. We propose a novel approach to model-based time-frequency estimation using time-varying autoregressive models. In this context, we take a fully Bayesian approach and allow both the autoregressive coefficients and innovation variance to vary over time. Importantly, our estimation method uses the lattice filter and is cast within the partial autocorrelation domain. The marginal posterior distributions are of standard form and, as a convenient by-product of our estimation method, our approach avoids undesirable matrix inversions. As such, estimation is extremely computationally efficient and stable. To illustrate the effectiveness of our approach, we conduct a comprehensive simulation study that compares our method with other competing methods and find that, in most cases, our approach performs superior in terms of average squared error between the estimated and true time-varying spectral density. 
Lastly, we demonstrate our methodology through three modeling applications; namely, insect communication signals, environmental data (wind components), and macroeconomic data (US gross domestic product (GDP) and consumption).}, url = {http://arxiv.org/abs/1408.2757}, author = {Yang, W.~H. and Holan, S.~H. and Wikle, C.K.} } @article {2221, title = {Bayesian Lattice Filters for Time-Varying Autoregression and Time{\textendash}Frequency Analysis}, journal = {Project Euclid}, year = {2015}, month = {10/2015}, pages = {27}, abstract = {Modeling nonstationary processes is of paramount importance to many scientific disciplines including environmental science, ecology, and finance, among others. Consequently, flexible methodology that provides accurate estimation across a wide range of processes is a subject of ongoing interest. We propose a novel approach to model-based time{\textendash}frequency estimation using time-varying autoregressive models. In this context, we take a fully Bayesian approach and allow both the autoregressive coefficients and innovation variance to vary over time. Importantly, our estimation method uses the lattice filter and is cast within the partial autocorrelation domain. The marginal posterior distributions are of standard form and, as a convenient by-product of our estimation method, our approach avoids undesirable matrix inversions. As such, estimation is extremely computationally efficient and stable. To illustrate the effectiveness of our approach, we conduct a comprehensive simulation study that compares our method with other competing methods and find that, in most cases, our approach performs superior in terms of average squared error between the estimated and true time-varying spectral density. Lastly, we demonstrate our methodology through three modeling applications; namely, insect communication signals, environmental data (wind components), and macroeconomic data (US gross domestic product (GDP) and consumption).}, keywords = {locally stationary, model selection, nonstationary partial autocorrelation, piecewise stationary, sequential estimation, time-varying spectral density}, doi = {10.1214/15-BA978}, url = {http://projecteuclid.org/euclid.ba/1445263834}, author = {Yang, W.~H. and Holan, Scott H. and Wikle, Christopher K.} } @article {2039, title = {Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography}, journal = {Spatial Statistics}, volume = {14}, year = {2015}, month = {08/2015}, pages = {439--451}, doi = {10.1016/j.spasta.2015.07.008}, url = {http://www.sciencedirect.com/science/article/pii/S2211675315000718}, author = {Quick, Harrison and Holan, Scott H. and Wikle, Christopher K. and Reiter, Jerome P.} } @article {2015arXiv:1407.7795, title = {Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography}, journal = {ArXiv}, year = {2015}, abstract = {Many data stewards collect confidential data that include fine geography. When sharing these data with others, data stewards strive to disseminate data that are informative for a wide range of spatial and non-spatial analyses while simultaneously protecting the confidentiality of data subjects{\textquoteright} identities and attributes. Typically, data stewards meet this challenge by coarsening the resolution of the released geography and, as needed, perturbing the confidential attributes. 
When done with high intensity, these redaction strategies can result in released data with poor analytic quality. We propose an alternative dissemination approach based on fully synthetic data. We generate data using marked point process models that can maintain both the statistical properties and the spatial dependence structure of the confidential data. We illustrate the approach using data consisting of mortality records from Durham, North Carolina.}, url = {http://arxiv.org/abs/1407.7795}, author = {Quick, H. and Holan, S.~H. and Wikle, C.~K. and Reiter, J.~P.} } @article {2088, title = {Bayesian Semiparametric Hierarchical Empirical Likelihood Spatial Models}, journal = {Journal of Statistical Planning and Inference}, volume = {165}, year = {2015}, month = {10/2015}, pages = {78-90}, issn = {0378-3758}, doi = {10.1016/j.jspi.2015.04.002}, author = {Porter, A.T. and Holan, S.H. and Wikle, C.K.} } @article {2204, title = {Bayesian Spatial Change of Support for Count-Valued Survey Data with Application to the American Community Survey}, journal = {Journal of the American Statistical Association}, year = {2015}, month = {12/2015}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year {\textquotedblleft}period-estimates,{\textquotedblright} and corresponding margins of errors, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies it is often of interest to data-users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on {\textquotedblleft}new{\textquotedblright} spatial supports in {\textquotedblleft}real-time.{\textquotedblright} This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey-data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in {\textquotedblleft}real-time.{\textquotedblright} We show the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, doi = {10.1080/01621459.2015.1117471}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2015.1117471}, author = {Bradley, Jonathan and Wikle, C.K. and Holan, S.~H.} } @article {2219, title = {Bayesian Spatial Change of Support for Count-Valued Survey Data with Application to the American Community Survey}, journal = {Journal of the American Statistical Association}, year = {2015}, month = {12/2015}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. 
Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year {\textquotedblleft}period-estimates,{\textquotedblright} and corresponding margins of errors, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies it is often of interest to data-users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on {\textquotedblleft}new{\textquotedblright} spatial supports in {\textquotedblleft}real-time.{\textquotedblright} This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey-data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in {\textquotedblleft}real-time.{\textquotedblright} We show the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, keywords = {Aggregation, American Community Survey, Bayesian hierarchical model, Givens angle prior, Markov chain Monte Carlo, Multiscale model, Non-Gaussian.}, doi = {10.1080/01621459.2015.1117471}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2015.1117471}, author = {Bradley, Jonathan R. and Wikle, Christopher K. and Holan, Scott H.} } @article {2015arXiv:1405.7227, title = {Bayesian Spatial Change of Support for Count{\textendash}Valued Survey Data}, journal = {ArXiv}, year = {2015}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year "period-estimates," and corresponding margins of errors, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies it is often of interest to data users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on "new" spatial supports in "real-time." This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey-data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. 
This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in "real-time." We demonstrate the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, url = {http://arxiv.org/abs/1405.7227}, author = {Bradley, J.~R. and Wikle, C.K. and Holan, S.~H.} } @techreport {steorts_2015_syria, title = {{Blocking Methods Applied to Casualty Records from the Syrian Conflict}}, number = {1510.07714}, year = {2015}, url = {http://arxiv.org/abs/1510.07714}, author = {Sadosky, Peter and Shrivastava, Anshumali and Price, Megan and Steorts, Rebecca} } @article {1877, title = {Capturing multivariate spatial dependence: Model, estimate, and then predict}, journal = {Statistical Science}, volume = {30}, year = {2015}, month = {06/2015}, pages = {170-175}, doi = {10.1214/15-STS517}, url = {http://projecteuclid.org/euclid.ss/1433341474}, author = {Cressie, N. and Burden, S. and Davis, W. and Krivitsky, P. and Mokhtarian, P. and Seusse, T. and Zammit-Mangion, A.} } @techreport {2264, title = {Categorical data fusion using auxiliary information}, number = {1506.05886}, year = {2015}, institution = {arXiv}, abstract = {In data fusion analysts seek to combine information from two databases comprised of disjoint sets of individuals, in which some variables appear in both databases and other variables appear in only one database. Most data fusion techniques rely on variants of conditional independence assumptions. When inappropriate, these assumptions can result in unreliable inferences. We propose a data fusion technique that allows analysts to easily incorporate auxiliary information on the dependence structure of variables not observed jointly; we refer to this auxiliary information as glue. With this technique, we fuse two marketing surveys from the book publisher HarperCollins using glue from the online, rapid-response polling company CivicScience. The fused data enable estimation of associations between people{\textquoteright}s preferences for authors and for learning about new books. The analysis also serves as a case study on the potential for using online surveys to aid data fusion.}, url = {http://arxiv.org/abs/1506.05886}, author = {Fosdick, B. K. and Maria DeYoreo and J. P. Reiter} } @article {1825, title = {Change in Visible Impervious Surface Area in Southeastern Michigan Before and After the {\textquotedblleft}Great Recession:{\textquotedblright} Spatial Differentiation in Remotely Sensed Land-Cover Dynamics}, journal = {Population and Environment}, volume = {36}, year = {2015}, month = {03/2015}, pages = {331-355}, chapter = {331}, doi = {10.1007/s11111-014-0219-y}, url = {http://link.springer.com/article/10.1007\%2Fs11111-014-0219-y}, author = {Wilson, C. R. and Brown, D. G.} } @conference {2107, title = {Changing {\textquoteleft}Who{\textquoteright} or {\textquoteleft}Where{\textquoteright}: Implications for Data Quality in the American Time Use Survey}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Deal, C.E. and Kirchner, A. and Cordova-Cazar, A.L. and Ellyne, L. 
and Belli, R.F.} } @article {2086, title = {Comment on Article by Ferreira and Gamerman}, journal = {Bayesian Analysis}, volume = {10}, year = {2015}, month = {04/2015}, pages = {741-748}, doi = {doi:10.1214/15-BA944B}, url = {http://projecteuclid.org/euclid.ba/1429880217}, author = {Cressie, N. and Chambers, R. L.} } @article {1883, title = {Comment on {\textquoteleft}{\textquoteleft}Semiparametric Bayesian Density Estimation with Disparate Data Sources: A Meta-Analysis of Global Childhood Undernutrition" by Finncane, M. M., Paciorek, C. J., Stevens, G. A., and Ezzati, M.}, journal = {Journal of the American Statistical Association}, year = {2015}, author = {Wikle, C.K. and Holan, S.H.} } @article {1878, title = {Comment: Spatial sampling designs depend as much on {\textquotedblleft}how much?{\textquotedblright} and {\textquotedblleft}why?{\textquotedblright} as on {\textquotedblleft}where?{\textquotedblright}}, journal = {Bayesian Analysis}, year = {2015}, abstract = {A comment on {\textquotedblleft}Optimal design in geostatistics under preferential sampling{\textquotedblright} by G. da Silva Ferreira and D. Gamerman}, author = {Cressie, N. and Chambers, R. L.} } @article {10.1257/jel.53.3.631, title = {Communicating Uncertainty in Official Economic Statistics: An Appraisal Fifty Years after Morgenstern}, journal = {Journal of Economic Literature}, volume = {53}, year = {2015}, month = {09/2015}, pages = {631-53}, abstract = {Federal statistical agencies in the United States and analogous agencies elsewhere commonly report official economic statistics as point estimates, without accompanying measures of error. Users of the statistics may incorrectly view them as error free or may incorrectly conjecture error magnitudes. This paper discusses strategies to mitigate misinterpretation of official statistics by communicating uncertainty to the public. Sampling error can be measured using established statistical principles. The challenge is to satisfactorily measure the various forms of nonsampling error. I find it useful to distinguish transitory statistical uncertainty, permanent statistical uncertainty, and conceptual uncertainty. I illustrate how each arises as the Bureau of Economic Analysis periodically revises GDP estimates, the Census Bureau generates household income statistics from surveys with nonresponse, and the Bureau of Labor Statistics seasonally adjusts employment statistics. I anchor my discussion of communication of uncertainty in the contribution of Oskar Morgenstern (1963a), who argued forcefully for agency publication of error estimates for official economic statistics. (JEL B22, C82, E23)}, keywords = {and Organizing Macroeconomic Data; Data Access E23: Macroeconomics: Production, B22: History of Economic Thought: Macroeconomics C82: Methodology for Collecting, Estimating}, doi = {10.1257/jel.53.3.631}, url = {http://www.aeaweb.org/articles.php?doi=10.1257/jel.53.3.631}, author = {Manski, Charles F.} } @article {2083, title = {Comparing and selecting spatial predictors using local criteria}, journal = {Test}, volume = {24}, year = {2015}, month = {03/2015}, pages = {1-28}, chapter = {1}, issn = {1133-0686}, doi = {10.1007/s11749-014-0415-1}, url = {http://dx.doi.org/10.1007/s11749-014-0415-1}, author = {Bradley, J.R. and Cressie, N. 
and Shi, T.} } @mastersthesis {2033, title = {A Comparison of Multiple Imputation Methods for Categorical Data (Master{\textquoteright}s Thesis)}, year = {2015}, school = {Duke University}, type = {Masters}, author = {Akande, O.} } @techreport {1989, title = {Cost-Benefit Analysis for a Quinquennial Census: The 2016 Population Census of South Africa.}, number = {WP-15-06}, year = {2015}, institution = {Northwestern University, Institute for Policy Research}, type = {Working Paper}, abstract = {

The question of whether to carry out a quinquennial census is being faced by national statistical offices in increasingly many countries, including Canada, Nigeria, Ireland, Australia, and South Africa. The authors describe uses, and limitations, of cost-benefit analysis for this decision problem in the case of the 2016 census of South Africa. The government of South Africa needed to decide whether to conduct a 2016 census or to rely on increasingly inaccurate post-censal estimates accounting for births, deaths, and migration since the previous (2011) census. The cost-benefit analysis compared predicted costs of the 2016 census to the benefits from improved allocation of intergovernmental revenue, which was considered by the government to be a critical use of the 2016 census, although not the only important benefit. Without the 2016 census, allocations would be based on population estimates. Accuracy of the post-censal estimates was estimated from the performance of past estimates, and the hypothetical expected reduction in errors in allocation due to the 2016 census was estimated. A loss function was introduced to quantify the improvement in allocation. With this evidence, the government was able to decide not to conduct the 2016 census, but instead to improve data and capacity for producing post-censal estimates.

}, keywords = {demographic statistics, fiscal allocations, loss function, population estimates, post-censal estimates}, url = {http://www.ipr.northwestern.edu/publications/papers/2015/ipr-wp-15-06.html}, author = {Spencer, Bruce D. and May, Julian and Kenyon, Steven and Seeskin, Zachary H.} } @conference {2120, title = {Determining Potential for Breakoff in Time Diary Survey Using Paradata}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wettlaufer, D. and Arunachalam, H. and Atkin, G. and Eck, A. and Soh, L.-K. and Belli, R.F.} } @article {2040, title = {Dirichlet Process Mixture Models for Nested Categorical Data}, journal = {ArXiv}, year = {2015}, abstract = {We present a Bayesian model for estimating the joint distribution of multivariate categorical data when units are nested within groups. Such data arise frequently in social science settings, for example, people living in households. The model assumes that (i) each group is a member of a group-level latent class, and (ii) each unit is a member of a unit-level latent class nested within its group-level latent class. This structure allows the model to capture dependence among units in the same group. It also facilitates simultaneous modeling of variables at both group and unit levels. We develop a version of the model that assigns zero probability to groups and units with physically impossible combinations of variables. We apply the model to estimate multivariate relationships in a subset of the American Community Survey. Using the estimated model, we generate synthetic household data that could be disseminated as redacted public use files with high analytic validity and low disclosure risks. Supplementary materials for this article are available online.}, url = {http://arxiv.org/pdf/1412.2282v3.pdf}, author = {Hu, J. and Reiter, J.P. and Wang, Q.} } @mastersthesis {2032, title = {Dirichlet Process Mixture Models for Nested Categorical Data (Ph.D. Thesis)}, year = {2015}, school = {Duke University}, type = {Ph.D.}, url = {http://dukespace.lib.duke.edu/dspace/handle/10161/9933}, author = {Hu, J.} } @conference {2116, title = {Do Interviewers with High Cooperation Rates Behave Differently? Interviewer Cooperation Rates and Interview Behaviors}, booktitle = {International Conference on Total Survey Error}, year = {2015}, month = {09/2015}, address = {Baltimore, MD}, url = {http://www.niss.org/events/2015-international-total-survey-error-conference}, author = {Olson, K. and Smyth, J.D. and Kirchner, A.} } @conference {2115, title = {Do Interviewers with High Cooperation Rates Behave Differently? Interviewer Cooperation Rates and Interview Behaviors}, booktitle = {Joint Statistical Meetings}, year = {2015}, month = {08/2015}, address = {Seattle, WA}, url = {http://www.amstat.org/meetings/jsm/2015/program.cfm}, author = {Olson, K. and Smyth, J.D. and Kirchner, A.} } @mastersthesis {2031, title = {Dynamic Models of Human Capital Accumulation (Ph.D. Thesis)}, year = {2015}, school = {Duke University}, type = {Ph.D.}, url = {http://dukespace.lib.duke.edu/dspace/handle/10161/9929}, author = {Ransom, T.} } @techreport {handle:1813:40581, title = {Economic Analysis and Statistical Disclosure Limitation}, number = {1813:40581}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {

Economic Analysis and Statistical Disclosure Limitation Abowd, John M.; Schmutte, Ian M. This paper explores the consequences for economic research of methods used by data publishers to protect the privacy of their respondents. We review the concept of statistical disclosure limitation for an audience of economists who may be unfamiliar with these methods. We characterize what it means for statistical disclosure limitation to be ignorable. When it is not ignorable, we consider the effects of statistical disclosure limitation for a variety of research designs common in applied economic research. Because statistical agencies do not always report the methods they use to protect confidentiality, we also characterize settings in which statistical disclosure limitation methods are discoverable; that is, they can be learned from the released data. We conclude with advice for researchers, journal editors, and statistical agencies.

}, url = {http://hdl.handle.net/1813/40581}, author = {Abowd, John M. and Schmutte, Ian M.} } @article {2057, title = {Economic Analysis and Statistical Disclosure Limitation}, journal = {Brookings Papers on Economic Activity}, volume = {Spring 2015}, year = {2015}, month = {03/2015}, abstract = {Economic Analysis and Statistical Disclosure Limitation Abowd, John M.; Schmutte, Ian M. This paper explores the consequences for economic research of methods used by data publishers to protect the privacy of their respondents. We review the concept of statistical disclosure limitation for an audience of economists who may be unfamiliar with these methods. We characterize what it means for statistical disclosure limitation to be ignorable. When it is not ignorable, we consider the effects of statistical disclosure limitation for a variety of research designs common in applied economic research. Because statistical agencies do not always report the methods they use to protect confidentiality, we also characterize settings in which statistical disclosure limitation methods are discoverable; that is, they can be learned from the released data. We conclude with advice for researchers, journal editors, and statistical agencies.}, issn = {00072303}, url = {http://www.brookings.edu/about/projects/bpea/papers/2015/economic-analysis-statistical-disclosure-limitation}, author = {Abowd, John M. and Schmutte, Ian M.} } @article {2100, title = {The Effect of CATI Questionnaire Design Features on Response Timing}, journal = {Journal of Survey Statistics and Methodology}, volume = {3}, year = {2015}, pages = {361-396}, doi = {10.1093/jssam/smv021}, author = {Olson, K. and Smyth, J.D.} } @techreport {1990, title = {Effects of Census Accuracy on Apportionment of Congress and Allocations of Federal Funds.}, number = {WP-15-05}, year = {2015}, institution = {Northwestern University, Institute for Policy Research}, type = {Working Paper}, abstract = {

How much accuracy is needed in the 2020 census depends on the cost of attaining accuracy and on the consequences of imperfect accuracy. The cost target for the 2020 census of the United States has been specified, and the Census Bureau is developing projections of the accuracy attainable for that cost. It is desirable to have information about the consequences of the accuracy that might be attainable for that cost or for alternative cost levels. To assess the consequences of imperfect census accuracy, Seeskin and Spencer consider alternative profiles of accuracy for states and assess their implications for apportionment of the U.S. House of Representatives and for allocation of federal funds. An error in allocation is defined as the difference between the allocation computed under imperfect data and the allocation computed with perfect data. Estimates of expected sums of absolute values of errors are presented for House apportionment and for federal funds allocations.

}, url = {http://www.ipr.northwestern.edu/publications/papers/2015/ipr-wp-15-05.html}, author = {Seeskin, Zachary H. and Spencer, Bruce D.} } @conference {2108, title = {Effects of interviewer and respondent behavior on data quality: An investigation of question types and interviewer learning}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Kirchner, A. and Olson, K.} } @conference {2109, title = {Effects of interviewer and respondent behavior on data quality: An investigation of question types and interviewer learning}, booktitle = {6th Conference of the European Survey Research Association}, year = {2015}, month = {07/2015}, address = {Reykjavik, Iceland}, url = {http://www.europeansurveyresearch.org/conference}, author = {Kirchner, A. and Olson, K.} } @article {http://arxiv.org/abs/1508.05918, title = {An empirical comparison of multiple imputation methods for categorical data}, journal = {arXiv}, year = {2015}, abstract = {Multiple imputation is a common approach for dealing with missing values in statistical databases. The imputer fills in missing values with draws from predictive models estimated from the observed data, resulting in multiple, completed versions of the database. Researchers have developed a variety of default routines to implement multiple imputation; however, there has been limited research comparing the performance of these methods, particularly for categorical data. We use simulation studies to compare repeated sampling properties of three default multiple imputation methods for categorical data, including chained equations using generalized linear models, chained equations using classification and regression trees, and a fully Bayesian joint distribution based on Dirichlet Process mixture models. We base the simulations on categorical data from the American Community Survey. The results suggest that default chained equations approaches based on generalized linear models are dominated by the default regression tree and mixture model approaches. They also suggest competing advantages for the regression tree and mixture model approaches, making both reasonable default engines for multiple imputation of categorical data.}, url = {http://arxiv.org/abs/1508.05918}, author = {Akande, O. and Li, Fan and Reiter , J. P.} } @article {steorts2015, title = {Entity Resolution with Empirically Motivated Priors}, journal = {Bayesian Anal.}, volume = {10}, year = {2015}, month = {12}, pages = {849{\textendash}875}, abstract = {Databases often contain corrupted, degraded, and noisy data with duplicate entries across and within each database. Such problems arise in citations, medical databases, genetics, human rights databases, and a variety of other applied settings. The target of statistical inference can be viewed as an unsupervised problem of determining the edges of a bipartite graph that links the observed records to unobserved latent entities. Bayesian approaches provide attractive benefits, naturally providing uncertainty quantification via posterior probabilities. We propose a novel record linkage approach based on empirical Bayesian principles. Specifically, the empirical Bayesian-type step consists of taking the empirical distribution function of the data as the prior for the latent entities. 
This approach improves on the earlier HB approach not only by avoiding the prior specification problem but also by allowing both categorical and string-valued variables. Our extension to string-valued variables also involves the proposal of a new probabilistic mechanism by which observed record values for string fields can deviate from the values of their associated latent entities. Categorical fields that deviate from their corresponding true value are simply drawn from the empirical distribution function. We apply our proposed methodology to a simulated data set of German names and an Italian household survey on income and wealth, showing our method performs favorably compared to several standard methods in the literature. We also consider the robustness of our methods to changes in the hyper-parameters.}, doi = {10.1214/15-BA965SI}, url = {http://dx.doi.org/10.1214/15-BA965SI}, author = {Steorts, Rebecca C.} } @article {2198, title = {Entity resolution with empirically motivated priors}, journal = {Bayesian Analysis}, volume = {10}, year = {2015}, abstract = {Databases often contain corrupted, degraded, and noisy data with duplicate entries across and within each database. Such problems arise in citations, medical databases, genetics, human rights databases, and a variety of other applied settings. The target of statistical inference can be viewed as an unsupervised problem of determining the edges of a bipartite graph that links the observed records to unobserved latent entities. Bayesian approaches provide attractive benefits, naturally providing uncertainty quantification via posterior probabilities. We propose a novel record linkage approach based on empirical Bayesian principles. Specifically, the empirical Bayesian--type step consists of taking the empirical distribution function of the data as the prior for the latent entities. This approach improves on the earlier HB approach not only by avoiding the prior specification problem but also by allowing both categorical and string-valued variables. Our extension to string-valued variables also involves the proposal of a new probabilistic mechanism by which observed record values for string fields can deviate from the values of their associated latent entities. Categorical fields that deviate from their corresponding true value are simply drawn from the empirical distribution function. We apply our proposed methodology to a simulated data set of German names and an Italian household survey, showing our method performs favorably compared to several standard methods in the literature. We also consider the robustness of our methods to changes in the hyper-parameters.}, doi = {10.1214/15-BA965SI}, url = {http://projecteuclid.org/euclid.ba/1441790411}, author = {Steorts, Rebecca C.} } @mastersthesis {2269, title = {Essays on Multinational Production and the Propagation of Shocks}, year = {2015}, school = {University of Michigan}, type = {Ph.D.}, address = {Ann Arbor, MI}, abstract = {The increased exposure of the United States to economic shocks originating from abroad is a common concern of those critical of globalization. An understanding of the cross-country transmission of shocks is of central importance for policymakers seeking to limit excess volatility resulting from international linkages. Firms whose ownership spans multiple countries are one under-appreciated mechanism. 
These multinationals represent an enormous share of the global economy, but a general scarcity of firm-level data has limited our understanding of how they affect both origin and destination countries. One contribution of this dissertation is to expand the data availability on these firms, using innovative data-linking techniques. The first chapter provides some of the first ever causal evidence on the role of trade and multinational production in the transmission of economic shocks and the cross-country synchronization of business cycles. This chapter leverages the 2011 Japanese earthquake/tsunami as a natural experiment. It finds that those U.S. firms with large exposure to intermediate inputs from Japan -- typically the affiliates of Japanese multinationals -- experience significant output declines after this shock, roughly one-for-one with declines in imported inputs. Structural estimation of the production function reveals substantial complementarities between imported and domestic inputs. These results suggest that global supply chains are more rigid than previously thought. The second chapter incorporates this low production elasticity of imported inputs into an otherwise standard dynamic stochastic general equilibrium model. The low degree of input substitutability, when applied to the share of trade governed by multinational firms, can generate effects in the aggregate. Value-added co-movement increases by 11 percentage points in the baseline model relative to a model where such features are absent. The model confirms that real linkages -- in addition to financial and policy spillovers -- play an important role in business cycle synchronization. The third chapter describes additional characteristics of multinational firms relative to domestic and exporting firms in the U.S. economy. These firms are larger, more productive, more capital intensive, and pay higher wages than other firms. The relative patterns of trade and output offer valuable guidance for the motives for ownership that spans national boundaries.}, keywords = {Business Cycle Comovement, Global Supply Chains, Multinational Firms}, url = {http://hdl.handle.net/2027.42/111331}, author = {Flaaen, Aaron} } @inbook {1740, title = {Evaluation of diagnostics for hierarchical spatial statistical models}, booktitle = {Geometry Driven Statistics}, year = {2015}, pages = {241-256}, publisher = {Wiley}, organization = {Wiley}, edition = {1}, chapter = {12}, address = {Chinchester}, isbn = {978-1118866573}, issn = {978-1118866573}, url = {http://niasra.uow.edu.au/content/groups/public/@web/@inf/@math/documents/doc/uow169240.pdf}, author = {Cressie, N. and Burden, S.}, editor = {I.L. Dryden and J.T. Kent} } @article {1824, title = {Expanding the Discourse on Antipoverty Policy: Reconsidering a Negative Income Tax}, journal = {Journal of Poverty}, volume = {19}, year = {2015}, month = {02/2015}, pages = {218-238}, abstract = {This article proposes that advocates for the poor consider the replacement of the current means-tested safety net in the United States with a Negative Income Tax (NIT), a guaranteed income program that lifts families{\textquoteright} incomes above a minimum threshold. The article highlights gaps in service provision that leave millions in poverty, explains how a NIT could help fill those gaps, and compares current expenditures on major means-tested programs to estimated expenditures necessary for a NIT. 
Finally, it addresses the financial and political concerns that are likely to arise in the event that a NIT proposal gains traction among policy makers.}, keywords = {economic well-being, poverty alleviation, public policy, social welfare policy}, doi = {10.1080/10875549.2014.991889}, url = {http://dx.doi.org/10.1080/10875549.2014.991889}, author = {Jessica Wiederspan and Elizabeth Rhodes and H. Luke Shaefer} } @article {2077, title = {Figures of merit for simultaneous inference and comparisons in simulation experiments}, journal = {Stat}, volume = {4}, year = {2015}, month = {08/2015}, pages = {196-211}, chapter = {196}, doi = {10.1002/sta4.88}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.88/epdf}, author = {Cressie, N. and Burden, S.} } @mastersthesis {2270, title = {Four Essays in Unemployment, Wage Dynamics and Subjective Expectations}, year = {2015}, school = {University of Michigan}, type = {Ph.D.}, address = {Ann Arbor, MI}, abstract = {This dissertation contains four essays on unemployment differences between skill groups, on the effect of non-employment on wages and measurement error, and on subjective expectations of Americans about mortality and the stock market. Chapter 1 tests how much of the unemployment rate differences between education groups can be explained by occupational differences in labor adjustment costs. The educational gap in unemployment is substantial. Recent empirical studies found that the largest component of labor adjustment costs is adaptation costs: newly hired workers need a few months to get up to speed and reach full productivity. The chapter evaluates the effect of adaptation costs on unemployment using a calibrated search and matching model. Chapter 2 tests how short periods of non-employment affect survey reports of annual earnings. Non-employment has strong and non-standard effects on response error in earnings. Persons tend to report the permanent component of their earnings accurately, but transitory shocks are underreported. Transitory shocks due to career interruptions are very large, taking up several months of lost earnings, on average, and people only report 60-85\% of these earnings losses. The resulting measurement error is non-standard: it has a positive mean, it is right-skewed, and the bias correlates with predictors of turnover. Chapter 3 proposes and tests a model, the modal response hypothesis, to explain patterns in mortality expectations of Americans. The model is a mathematical expression of the idea that survey responses of 0\%, 50\%, or 100\% to probability questions indicate a high level of uncertainty about the relevant probability. The chapter shows that subjective survival expectations in 2002 line up very well with realized mortality of the HRS respondents between 2002 and 2010, and our model performs better than typically used models in the literature on subjective probabilities. 
Chapter 4 analyzes the impact of the stock market crash of 2008 on households{\textquoteright} expectations about the returns on the stock market index: the population average of expectations, the average uncertainty, and the cross-sectional heterogeneity in expectations from March 2008 to February 2009.}, keywords = {measurement error, subjective expectations, unemployment}, url = {http://hdl.handle.net/2027.42/113598}, author = {Hudomiet, Peter} } @conference {2119, title = {Grids and Online Panels: A Comparison of Device Type from a Survey Quality Perspective}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wang, Mengyang and McCutcheon, Allan L. and Allen, Laura} } @techreport {2417, title = {The role of occupation specific adaptation costs in explaining the educational gap in unemployment.}, year = {2015}, type = {Mimeo}, url = {https://sites.google.com/site/phudomiet/Hudomiet-JobMarketPaper.pdf?attredirects=0}, author = {Hudomiet, Peter} } @inbook {2092, title = {Hierarchical models for uncertainty quantification: An overview}, booktitle = {Handbook of Uncertainty Quantification}, year = {2015}, publisher = {Springer}, organization = {Springer}, issn = {978-3-319-12384-4}, author = {Wikle, C.K.}, editor = {Ghanem, R. and Higdon, D. and Owhadi, H.} } @inbook {WikleHooten2015, title = {Hierarchical Agent-Based Spatio-Temporal Dynamic Models for Discrete Valued Data}, booktitle = {Handbook of Discrete-Valued Time Series}, year = {2015}, publisher = {Chapman and Hall/CRC Press}, organization = {Chapman and Hall/CRC Press}, chapter = {Hierarchical Agent-Based Spatio-Temporal Dynamic Models for Discrete Valued Data}, address = {Boca Raton, FL.}, issn = {9781466577732}, url = {http://www.crcpress.com/product/isbn/9781466577732}, author = {Wikle, C.K. and Hooten, M.B.}, editor = {Davis, R. and Holan, S. and Lund, R. and Ravishanker, N.} } @inbook {HolanWikle, title = {Hierarchical Dynamic Generalized Linear Mixed Models for Discrete-Valued Spatio-Temporal Data}, booktitle = {Handbook of Discrete-Valued Time Series}, year = {2015}, note = {to appear in Handbook of Discrete-Valued Time Series}, publisher = {Chapman and Hall/CRC Press}, organization = {Chapman and Hall/CRC Press}, address = {Boca Raton, FL}, isbn = {9781466577732}, url = {http://www.crcpress.com/product/isbn/9781466577732}, author = {Holan, S.H. and Wikle, C.K.}, editor = {Davis, R. and Holan, S. and Lund, R. and Ravishanker, N.} } @inbook {1879, title = {Hierarchical Dynamic Generalized Linear Mixed Models for Discrete--Valued Spatio-Temporal Data}, booktitle = {Handbook of Discrete--Valued Time Series}, year = {2015}, author = {Holan, S.H. and Wikle, C.K.} } @inbook {2093, title = {Hierarchical Spatial Models}, booktitle = {Encyclopedia of Geographical Information Science}, year = {2015}, publisher = {Springer}, organization = {Springer}, author = {Arab, A. and Hooten, M.B. and Wikle, C.K.} } @article {2090, title = {Hierarchical, stochastic modeling across spatiotemporal scales of large river ecosystems and somatic growth in fish populations under various climate models: Missouri River sturgeon example}, journal = {Geological Society}, year = {2015}, author = {Wildhaber, M.L. and Wikle, C.K. and Moran, E.H. and Anderson, C.J. and Franz, K.J. and Dey, R.} } @article {2078, title = {Hot enough for you? 
A spatial exploratory and inferential analysis of North American climate-change projections}, journal = {Mathematical Geosciences}, year = {2015}, issn = {1874-8961}, doi = {10.1007/s11004-015-9607-9}, url = {http://dx.doi.org/10.1007/s11004-015-9607-9}, author = {Cressie, N. and Kang, E.L.} } @techreport {gelman2015individuals, title = {How individuals smooth spending: Evidence from the 2013 government shutdown using account data}, year = {2015}, institution = {National Bureau of Economic Research}, abstract = {Using comprehensive account records, this paper examines how individuals adjusted spending and saving in response to a temporary drop in income due to the 2013 U.S. government shutdown. The shutdown cut paychecks by 40\% for affected employees, which was recovered within 2 weeks. Though the shock was short-lived and completely reversed, spending dropped sharply implying a na{\"\i}ve estimate of the marginal propensity to spend of 0.58. This estimate overstates how consumption responded. While many individuals had low liquidity, they used multiple strategies to smooth consumption including delay of recurring payments such as mortgages and credit card balances.}, author = {Gelman, Michael and Kariv, Shachar and Shapiro, Matthew D and Silverman, Dan and Tadelis, Steven} } @conference {2103, title = {I Know What You Did Next: Predicting Respondent{\textquoteright}s Next Activity Using Machine Learning}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {May 14-17, 2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Arunachalam, H. and Atkin, G. and Eck, A. and Wettlaufer, D. and Soh, L.-K. and Belli, R.F.} } @techreport {handle:1813:40169, title = {Introduction to The Survey of Income and Program Participation (SIPP)}, number = {1813:40169}, year = {2015}, institution = {University of Michigan}, type = {Preprint}, abstract = {Introduction to The Survey of Income and Program Participation (SIPP) Shaefer, H. Luke Goals for the SIPP Workshop Provide you with an introduction to the SIPP and get you up and running on the public-use SIPP files, offer some advanced tools for 2008 Panel SIPP data analysis, Get you some experience analyzing SIPP data, Introduce you to the SIPP EHC (SIPP Redesign), Introduce you to the SIPP Synthetic Beta (SSB) Presentation made on May 15, 2015 at the Census Bureau, and previously in 2014 at Duke University and University of Michigan}, url = {http://hdl.handle.net/1813/40169}, author = {Shaefer, H. Luke} } @inbook {Lund, title = {Long Memory Discrete--Valued Time Series}, booktitle = {Handbook of Discrete-Valued Time Series}, year = {2015}, publisher = {Chapman and Hall}, organization = {Chapman and Hall}, chapter = {Long Memoriy Discrete-Valued Time Series}, url = {http://www.crcpress.com/product/isbn/9781466577732}, author = {Lund, R. and Holan, S.H. and Livsey, J.} } @techreport {handle:1813:40306, title = {Modeling Endogenous Mobility in Wage Determination}, number = {1813:40306}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {Modeling Endogenous Mobility in Wage Determination Abowd, John M.; McKinney, Kevin L.; Schmutte, Ian M. We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. 
First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax the exogenous mobility assumptions by modeling the evolution of the matched data as an evolving bipartite graph using a Bayesian latent class framework. Our results suggest that endogenous mobility biases estimated firm effects toward zero. To assess validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, url = {http://hdl.handle.net/1813/40306}, author = {Abowd, John M. and McKinney, Kevin L. and Schmutte, Ian M.} } @techreport {handle:1813:52608, title = {Modeling Endogenous Mobility in Wage Determination}, number = {1813:52608}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Modeling Endogenous Mobility in Wage Determination Abowd, John M.; McKinney, Kevin L.; Schmutte, Ian M. We evaluate the bias from endogenous job mobility in fixed-effects estimates of worker- and firm-specific earnings heterogeneity using longitudinally linked employer-employee data from the LEHD infrastructure file system of the U.S. Census Bureau. First, we propose two new residual diagnostic tests of the assumption that mobility is exogenous to unmodeled determinants of earnings. Both tests reject exogenous mobility. We relax exogenous mobility by modeling the matched data as an evolving bipartite graph using a Bayesian latent-type framework. Our results suggest that allowing endogenous mobility increases the variation in earnings explained by individual heterogeneity and reduces the proportion due to employer and match effects. To assess external validity, we match our estimates of the wage components to out-of-sample estimates of revenue per worker. The mobility-bias corrected estimates attribute much more of the variation in revenue per worker to variation in match quality and worker quality than the uncorrected estimates.}, url = {http://hdl.handle.net/1813/52608}, author = {Abowd, John M. and McKinney, Kevin L. and Schmutte, Ian M.} } @techreport {2015arXiv150701242D, title = {{Modeling for Dynamic Ordinal Regression Relationships: An Application to Estimating Maturity of Rockfish in California}}, number = {1507.01242}, year = {2015}, institution = {ArXiv}, abstract = {We develop a Bayesian nonparametric framework for modeling ordinal regression relationships which evolve in discrete time. The motivating application involves a key problem in fisheries research on estimating dynamically evolving relationships between age, length and maturity, the latter recorded on an ordinal scale. The methodology builds from nonparametric mixture modeling for the joint stochastic mechanism of covariates and latent continuous responses. This approach yields highly flexible inference for ordinal regression functions while at the same time avoiding the computational challenges of parametric models. A novel dependent Dirichlet process prior for time-dependent mixing distributions extends the model to the dynamic setting. 
The methodology is used for a detailed study of relationships between maturity, age, and length for Chilipepper rockfish, using data collected over 15 years along the coast of California.}, keywords = {Statistics - Applications}, url = {http://arxiv.org/abs/1507.01242}, author = {DeYoreo, M. and Kottas, A.} } @article {Wikletoappear, title = {Modern Perspectives on Statistics for Spatio-Temporal Data}, journal = {WIRES Computational Statistics}, volume = {7}, year = {2015}, pages = {86-98}, issn = {1939-0068}, doi = {10.1002/wics.1341}, url = {http://dx.doi.org/10.1002/wics.1341}, author = {Wikle, C.K.} } @article {fie:2015, title = {Moving Toward the New World of Censuses and Large-Scale Sample Surveys: Methodological Developments and Practical Implementations}, journal = {Journal of Official Statistics}, year = {2015}, note = {In press}, author = {Fienberg, S. E.} } @article {2019, title = {Multiple imputation for harmonizing longitudinal non-commensurate measures in individual participant data meta-analysis}, journal = {Statistics in Medicine}, year = {2015}, doi = {10.1002/sim.6562}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sim.6562/abstract}, author = {Siddique, J. and Reiter, J. P. and Brincks, A. and Gibbons, R. and Crespi, C. and Brown, C. H.} } @article {mur:rei:2015, title = {Multiple Imputation of Missing Categorical and Continuous Values via Bayesian Mixture Models with Local Dependence}, journal = {arXiv}, year = {2015}, abstract = {We present a nonparametric Bayesian joint model for multivariate continuous and categorical variables, with the intention of developing a flexible engine for multiple imputation of missing values. The model fuses Dirichlet process mixtures of multinomial distributions for categorical variables with Dirichlet process mixtures of multivariate normal distributions for continuous variables. We incorporate dependence between the continuous and categorical variables by (i) modeling the means of the normal distributions as component-specific functions of the categorical variables and (ii) forming distinct mixture components for the categorical and continuous data with probabilities that are linked via a hierarchical model. This structure allows the model to capture complex dependencies between the categorical and continuous data with minimal tuning by the analyst. We apply the model to impute missing values due to item nonresponse in an evaluation of the redesign of the Survey of Income and Program Participation (SIPP). The goal is to compare estimates from a field test with the new design to estimates from selected individuals from a panel collected under the old design. We show that accounting for the missing data changes some conclusions about the comparability of the distributions in the two datasets. We also perform an extensive repeated sampling simulation using similar data from complete cases in an existing SIPP panel, comparing our proposed model to a default application of multiple imputation by chained equations. Imputations based on the proposed model tend to have better repeated sampling properties than the default application of chained equations in this realistic setting.}, url = {arxiv.org/abs/1410.0438}, author = {Murray, J. S. and Reiter, J. P.} } @article {1882, title = {Multiscale Analysis of Survey Data: Recent Developments and Exciting Prospects}, journal = {Statistics Views}, year = {2015}, author = {Bradley, J.R. and Wikle, C.K. 
and Holan, S.H.} } @article {2671, title = {Multivariate Spatial Covariance Models: A Conditional Approach}, year = {2015}, abstract = {Multivariate geostatistics is based on modelling all covariances between all possible combinations of two or more variables at any sets of locations in a continuously indexed domain. Multivariate spatial covariance models need to be built with care, since any covariance matrix that is derived from such a model must be nonnegative-definite. In this article, we develop a conditional approach for spatial-model construction whose validity conditions are easy to check. We start with bivariate spatial covariance models and go on to demonstrate the approach{\textquoteright}s connection to multivariate models defined by networks of spatial variables. In some circumstances, such as modelling respiratory illness conditional on air pollution, the direction of conditional dependence is clear. When it is not, the two directional models can be compared. More generally, the graph structure of the network reduces the number of possible models to compare. Model selection then amounts to finding possible causative links in the network. We demonstrate our conditional approach on surface temperature and pressure data, where the role of the two variables is seen to be asymmetric.}, url = {https://arxiv.org/abs/1504.01865}, author = {Cressie, N. and Zammit-Mangion, A.} } @article {2089, title = {Multivariate Spatial Hierarchical Bayesian Empirical Likelihood Methods for Small Area Estimation}, journal = {STAT}, volume = {4}, year = {2015}, month = {05/2015}, pages = {108-116}, issn = {2049-1573}, doi = {10.1002/sta4.81}, url = {http://dx.doi.org/10.1002/sta4.81}, author = {Porter, A.T. and Holan, S.H. and Wikle, C.K.} } @article {2015arXiv:1503.00982, title = {Multivariate Spatio-Temporal Models for High-Dimensional Areal Data with Application to Longitudinal Employer-Household Dynamics}, journal = {ArXiv}, year = {2015}, abstract = {Many data sources report related variables of interest that are also referenced over geographic regions and time; however, there are relatively few general statistical methods that one can readily use that incorporate these multivariate spatio-temporal dependencies. Additionally, many multivariate spatio-temporal areal datasets are extremely high-dimensional, which leads to practical issues when formulating statistical models. For example, we analyze Quarterly Workforce Indicators (QWI) published by the US Census Bureau{\textquoteright}s Longitudinal Employer-Household Dynamics (LEHD) program. QWIs are available by different variables, regions, and time points, resulting in millions of tabulations. Despite their already expansive coverage, by adopting a fully Bayesian framework, the scope of the QWIs can be extended to provide estimates of missing values along with associated measures of uncertainty. Motivated by the LEHD, and other applications in federal statistics, we introduce the multivariate spatio-temporal mixed effects model (MSTM), which can be used to efficiently model high-dimensional multivariate spatio-temporal areal datasets. The proposed MSTM extends the notion of Moran{\textquoteright}s I basis functions to the multivariate spatio-temporal setting. 
This extension leads to several methodological contributions including extremely effective dimension reduction, a dynamic linear model for multivariate spatio-temporal areal processes, and the reduction of a high-dimensional parameter space using a novel parameter model.}, url = {http://arxiv.org/abs/1503.00982}, author = {Bradley, J.~R. and Holan, S.~H. and Wikle, C.K.} } @article {2169, title = {Multivariate Spatio-Temporal Models for High-Dimensional Areal Data with Application to Longitudinal Employer-Household Dynamics}, journal = {Annals of Applied Statistics}, volume = {9}, year = {2015}, month = {03/2015}, abstract = {Many data sources report related variables of interest that are also referenced over geographic regions and time; however, there are relatively few general statistical methods that one can readily use that incorporate these multivariate spatio-temporal dependencies. Additionally, many multivariate spatio-temporal areal datasets are extremely high-dimensional, which leads to practical issues when formulating statistical models. For example, we analyze Quarterly Workforce Indicators (QWI) published by the US Census Bureau{\textquoteright}s Longitudinal Employer-Household Dynamics (LEHD) program. QWIs are available by different variables, regions, and time points, resulting in millions of tabulations. Despite their already expansive coverage, by adopting a fully Bayesian framework, the scope of the QWIs can be extended to provide estimates of missing values along with associated measures of uncertainty. Motivated by the LEHD, and other applications in federal statistics, we introduce the multivariate spatio-temporal mixed effects model (MSTM), which can be used to efficiently model high-dimensional multivariate spatio-temporal areal datasets. The proposed MSTM extends the notion of Moran{\textquoteright}s I basis functions to the multivariate spatio-temporal setting. This extension leads to several methodological contributions including extremely effective dimension reduction, a dynamic linear model for multivariate spatio-temporal areal processes, and the reduction of a high-dimensional parameter space using a novel parameter model.}, doi = {10.1214/15-AOAS862}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K.} } @techreport {handle:1813:45822, title = {NCRN Meeting Fall 2016: Dynamic Question Ordering: Obtaining Useful Information While Reducing Burden}, number = {1813:45822}, year = {2015}, institution = {Carnegie-Mellon University}, type = {Preprint}, abstract = {NCRN Meeting Fall 2016: Dynamic Question Ordering: Obtaining Useful Information While Reducing Burden Early, Kirstin}, url = {http://hdl.handle.net/1813/45822}, author = {Early, Kirstin} } @techreport {handle:1813:45867, title = {NCRN Meeting Spring 2015}, number = {1813:45867}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015 Vilhuber, Lars May 7 meetings @ U.S. Census Bureau, Washington DC.}, url = {http://hdl.handle.net/1813/45867}, author = {Vilhuber, Lars} } @article {handle:1813:40181, title = {NCRN Meeting Spring 2015: A Vision for the Future of Data Access}, number = {1813:40181}, year = {2015}, publisher = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

NCRN Meeting Spring 2015: A Vision for the Future of Data Access Reiter, J.P. Presentation at the NCRN Meeting Spring 2015

}, url = {http://hdl.handle.net/1813/40181}, author = {Reiter, J.P.} } @article {handle:1813:40185, title = {NCRN Meeting Spring 2015: Broadening data access through synthetic data}, number = {1813:40185}, year = {2015}, publisher = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

NCRN Meeting Spring 2015: Broadening data access through synthetic data Vilhuber, Lars Presentation at the NCRN Meeting Spring 2015

}, url = {http://hdl.handle.net/1813/40185}, author = {Vilhuber, Lars} } @techreport {handle:1813:40188, title = {NCRN Meeting Spring 2015: Building and Training the Next Generation of Survey Methodologists and Researchers}, number = {1813:40188}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Building and Training the Next Generation of Survey Methodologists and Researchers Nugent, Rebecca Presentation at the NCRN Meetings Spring 2015}, url = {http://hdl.handle.net/1813/40188}, author = {Nugent, Rebecca} } @techreport {handle:1813:40186, title = {NCRN Meeting Spring 2015: Can Government-Academic Partnerships Help Secure the Future of the Federal Statistical System? Examples from the NSF-Census Research Network}, number = {1813:40186}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Can Government-Academic Partnerships Help Secure the Future of the Federal Statistical System? Examples from the NSF-Census Research Network Abowd, John M.; Fienberg, Stephen E. May 8, 2015 CNSTAT Public Seminar}, url = {http://hdl.handle.net/1813/40186}, author = {Abowd, John M. and Fienberg, Stephen E.} } @techreport {handle:1813:40187, title = {NCRN Meeting Spring 2015: Comment on: Can Government-Academic Partnerships Help Secure the Future of the Federal Statistical System? Examples from the NSF-Census Research Network}, number = {1813:40187}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Comment on: Can Government-Academic Partnerships Help Secure the Future of the Federal Statistical System? Examples from the NSF-Census Research Network Groshen, Erica L. Public Seminar Presentation by Erica L. Groshen at the Spring 2015 NCRN/CNSTAT Meetings}, url = {http://hdl.handle.net/1813/40187}, author = {Groshen, Erica L.} } @techreport {handle:1813:40182, title = {NCRN Meeting Spring 2015: Geographic Aspects of Direct and Indirect Estimators for Small Area Estimation}, number = {1813:40182}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Geographic Aspects of Direct and Indirect Estimators for Small Area Estimation Nagle, Nicholas Presentation at the NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40182}, author = {Nagle, Nicholas} } @techreport {handle:1813:40183, title = {NCRN Meeting Spring 2015: Geography and Usability of the American Community Survey}, number = {1813:40183}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Geography and Usability of the American Community Survey Spielman, Seth Presentation at the NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40183}, author = {Spielman, Seth} } @techreport {handle:1813:40176, title = {NCRN Meeting Spring 2015: Models for Multiscale Spatially-Referenced Count Data}, number = {1813:40176}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Models for Multiscale Spatially-Referenced Count Data Holan, Scott; Bradley, Jonathan R.; Wikle, Christopher K. Presentation at the NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40176}, author = {Holan, Scott and Bradley, Jonathan R. 
and Wikle, Christopher K.} } @techreport {handle:1813:40177, title = {NCRN Meeting Spring 2015: Regionalization of Multiscale Spatial Processes Using a Criterion for Spatial Aggregation Error}, number = {1813:40177}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Regionalization of Multiscale Spatial Processes Using a Criterion for Spatial Aggregation Error Wikle, Christopher K.; Bradley, Jonathan; Holan, Scott Develop and implement a statistical criterion to diagnose spatial aggregation error that can facilitate the choice of regionalizations of spatial data. Presentation at NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40177}, author = {Wikle, Christopher K. and Bradley, Jonathan and Holan, Scott} } @techreport {handle:1813:40184, title = {NCRN Meeting Spring 2015: Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods}, number = {1813:40184}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Revisiting the Economics of Privacy: Population Statistics and Confidentiality Protection as Public Goods Abowd, John M.; Schmutte, Ian Presentation at the NCRN Meeting Spring 2015}, url = {http://hdl.handle.net/1813/40184}, author = {Abowd, John M. and Schmutte, Ian} } @techreport {handle:1813:40309, title = {NCRN Meeting Spring 2015: Survey Informatics: The Future of Survey Methodology and Survey Statistics Training in the Academy?}, number = {1813:40309}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

NCRN Meeting Spring 2015: Survey Informatics: The Future of Survey Methodology and Survey Statistics Training in the Academy? McCutcheon, Allan L. Presentation at the NCRN Meeting Spring 2015

}, url = {http://hdl.handle.net/1813/40309}, author = {McCutcheon, Allan L.} } @techreport {handle:1813:40179, title = {NCRN Meeting Spring 2015: Training Undergraduates, Graduate Students, Postdocs, and Federal Agencies: Methodology, Data, and Science for Federal Statistics}, number = {1813:40179}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2015: Training Undergraduates, Graduate Students, Postdocs, and Federal Agencies: Methodology, Data, and Science for Federal Statistics Cressie, Noel; Holan, Scott H.; Wikle, Christopher K. Presentation at the NCRN Spring 2015 Meeting}, url = {http://hdl.handle.net/1813/40179}, author = {Cressie, Noel and Holan, Scott H. and Wikle, Christopher K.} } @techreport {handle:1813:40193, title = {NCRN Newsletter: Volume 2 - Issue 1}, number = {1813:40193}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 2 - Issue 1 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from October 2014 to January 2015. NCRN Newsletter Vol. 2, Issue 1: January 30, 2015.}, url = {http://hdl.handle.net/1813/40193}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:40194, title = {NCRN Newsletter: Volume 2 - Issue 2}, number = {1813:40194}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 2 - Issue 2 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from January 2015 to May 2015. NCRN Newsletter Vol. 2, Issue 2: May 12, 2015.}, url = {http://hdl.handle.net/1813/40194}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:44200, title = {NCRN Newsletter: Volume 2 - Issue 2}, number = {1813:44200}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 2 - Issue 2 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from February 2015 to May 2015. NCRN Newsletter Vol. 2, Issue 2: May 12, 2015.}, url = {http://hdl.handle.net/1813/44200}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:42393, title = {NCRN Newsletter: Volume 2 - Issue 3}, number = {1813:42393}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

NCRN Newsletter: Volume 2 - Issue 3 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from June 2015 through August 2015. NCRN Newsletter Vol. 2, Issue 3: September 15, 2015.

}, url = {http://hdl.handle.net/1813/42393}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:42338, title = {Noise Infusion as a Confidentiality Protection Measure for Graph-Based Statistics}, number = {1813:42338}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {Noise Infusion as a Confidentiality Protection Measure for Graph-Based Statistics Abowd, John A.; McKinney, Kevin L. We use the bipartite graph representation of longitudinally linked employer-employee data, and the associated projections onto the employer and employee nodes, respectively, to characterize the set of potential statistical summaries that the trusted custodian might produce. We consider noise infusion as the primary confidentiality protection method. We show that a relatively straightforward extension of the dynamic noise-infusion method used in the U.S. Census Bureau{\textquoteright}s Quarterly Workforce Indicators can be adapted to provide the same confidentiality guarantees for the graph-based statistics: all inputs have been modified by a minimum percentage deviation (i.e., no actual respondent data are used) and, as the number of entities contributing to a particular statistic increases, the accuracy of that statistic approaches the unprotected value. Our method also ensures that the protected statistics will be identical in all releases based on the same inputs.}, url = {http://hdl.handle.net/1813/42338}, author = {Abowd, John A. and McKinney, Kevin L.} } @article {http://arxiv.org/abs/1508.03758, title = {Nonparametric Bayesian models with focused clustering for mixed ordinal and nominal data}, journal = {ArXiV}, year = {2015}, publisher = {arXiv}, abstract = {Dirichlet process mixtures can be useful models of multivariate categorical data and effective tools for multiple imputation of missing categorical values. In some contexts, however, these models can fit certain variables well at the expense of others in ways beyond the analyst{\textquoteright}s control. For example, when the data include some variables with non-trivial amounts of missing values, the mixture model may fit the marginal distributions of the nearly and fully complete variables at the expense of the variables with high fractions of missing data. Motivated by this setting, we present a Dirichlet process mixture model for mixed ordinal and nominal data that allows analysts to split variables into two groups: focus variables and remainder variables. The model uses three sets of clusters, one set for ordinal focus variables, one for nominal focus variables, and one for all remainder variables. The model uses a multivariate ordered probit specification for the ordinal variables and independent multinomial kernels for the nominal variables. The three sets of clusters are linked using an infinite tensor factorization prior, as well as via dependence of the means of the latent continuous focus variables on the remainder variables. This effectively specifies a rich, complex model for the focus variables and a simpler model for remainder variables, yet still potentially captures associations among the variables. In the multiple imputation context, focus variables include key variables with high rates of missing values, and remainder variables include variables without much missing data. Using simulations, we illustrate advantages and limitations of using focused clustering compared to mixture models that do not distinguish variables. 
We apply the model to handle missing values in an analysis of the 2012 American National Election Study.}, url = {http://arxiv.org/abs/1508.03758}, author = {DeYoreo, Maria and Reiter , J. P. and Hillygus, D. S.} } @article {deyoreo:reiter:hillygus, title = {Nonparametric Bayesian models with focused clustering for mixed ordinal and nominal data}, journal = {Bayesian Analysis}, year = {2015}, month = {08/2015}, abstract = {Dirichlet process mixtures can be useful models of multivariate categorical data and effective tools for multiple imputation of missing categorical values. In some contexts, however, these models can fit certain variables well at the expense of others in ways beyond the analyst{\textquoteright}s control. For example, when the data include some variables with non-trivial amounts of missing values, the mixture model may fit the marginal distributions of the nearly and fully complete variables at the expense of the variables with high fractions of missing data. Motivated by this setting, we present a Dirichlet process mixture model for mixed ordinal and nominal data that allows analysts to split variables into two groups: focus variables and remainder variables. The model uses three sets of clusters, one set for ordinal focus variables, one for nominal focus variables, and one for all remainder variables. The model uses a multivariate ordered probit specification for the ordinal variables and independent multinomial kernels for the nominal variables. The three sets of clusters are linked using an infinite tensor factorization prior, as well as via dependence of the means of the latent continuous focus variables on the remainder variables. This effectively specifies a rich, complex model for the focus variables and a simpler model for remainder variables, yet still potentially captures associations among the variables. In the multiple imputation context, focus variables include key variables with high rates of missing values, and remainder variables include variables without much missing data. Using simulations, we illustrate advantages and limitations of using focused clustering compared to mixture models that do not distinguish variables. We apply the model to handle missing values in an analysis of the 2012 American National Election Study.}, doi = {10.1214/16-BA1020}, author = {M. De Yoreo and J. P. Reiter and D. S. Hillygus} } @article {1737, title = {A nonparametric, multiple imputation-based method for the retrospective integration of data sets}, journal = {Multivariate Behavioral Research}, volume = {50}, year = {2015}, pages = {383-397}, chapter = {383}, doi = {10.1080/00273171.2015.1022641}, url = {http://www.tandfonline.com/doi/full/10.1080/00273171.2015.1022641}, author = {M.M. Carrig and D. Manrique-Vallier and K. Ranby and J.P. Reiter and R. Hoyle} } @article {2414, title = {Perceptions, behaviors and satisfaction related to public safety for persons with disabilities in the United States}, journal = {Criminal Justice Review}, volume = {1}, year = {2015}, author = {Brucker, D.} } @conference {2118, title = {Predicting Breakoff Using Sequential Machine Learning Methods}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Soh, L.-K. and Eck, A. 
and McCutcheon, A.L.} } @techreport {handle:1813:40172, title = {Presentation: NADDI 2015: Crowdsourcing DDI Development: New Features from the CED2AR Project}, number = {1813:40172}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {Presentation: NADDI 2015: Crowdsourcing DDI Development: New Features from the CED2AR Project Perry, Benjamin; Kambhampaty, Venkata; Brumsted, Kyle; Vilhuber, Lars; Block, William Recent years have shown the power of user-sourced information evidenced by the success of Wikipedia and its many emulators. This sort of unstructured discussion is currently not feasible as a part of the otherwise successful metadata repositories. Creating and augmenting metadata is a labor-intensive endeavor. Harnessing collective knowledge from actual data users can supplement officially generated metadata. As part of our Comprehensive Extensible Data Documentation and Access Repository (CED2AR) infrastructure, we demonstrate a prototype of crowdsourced DDI, using DDI-C and supplemental XML. The system allows for any number of network connected instances (web or desktop deployments) of the CED2AR DDI editor to concurrently create and modify metadata. The backend transparently handles changes, and frontend has the ability to separate official edits (by designated curators of the data and the metadata) from crowd-sourced content. We briefly discuss offline edit contributions as well. CED2AR uses DDI-C and supplemental XML together with Git for a very portable and lightweight implementation. This distributed network implementation allows for large scale metadata curation without the need for a hardware intensive computing environment, and can leverage existing cloud services, such as Github or Bitbucket. Ben Perry (Cornell/NCRN) presents joint work with Venkata Kambhampaty, Kyle Brumsted, Lars Vilhuber, \& William C. Block at NADDI 2015.}, url = {http://hdl.handle.net/1813/40172}, author = {Perry, Benjamin and Kambhampaty, Venkata and Brumsted, Kyle and Vilhuber, Lars and Block, William} } @article {2257, title = {Preventive policy strategy for banking the unbanked: Savings accounts for teenagers?}, journal = {Journal of Poverty }, volume = {20}, year = {2015}, month = {07/2015}, pages = {2-33}, chapter = {2}, abstract = {Concern over percentages of unbanked and underbanked households in the United States and their lack of connectedness to the financial mainstream has led to policy strategies geared toward reaching these households. Using nationally-representative longitudinal data, a preventive strategy for banking households is tested that asks whether young adults are more likely to be banked and own a diversity of financial assets when they are connected to the financial mainstream as teenagers. Young adults are more likely to own checking accounts, savings accounts, certificates of deposit, and stocks when they had savings accounts as teenagers. Policy implications are discussed.}, keywords = {financial assets, savings, Survey of Income and Program Participation (SIPP), teenagers, unbanked, young adults}, doi = {10.1080/10875549.2015.1015068}, url = {http://www.tandfonline.com/doi/full/10.1080/10875549.2015.1015068}, author = {Friedline, T. and Despard, M. and Chowa, G.} } @article {1873, title = {Privacy and human behavior in the age of information}, journal = {Science}, volume = {347}, year = {2015}, chapter = {509}, abstract = {This Review summarizes and draws connections between diverse streams of empirical research on privacy behavior. 
We use three themes to connect insights from social and behavioral sciences: people{\textquoteright}s uncertainty about the consequences of privacy-related behaviors and their own preferences over those consequences; the context-dependence of people{\textquoteright}s concern, or lack thereof, about privacy; and the degree to which privacy concerns are malleable{\textemdash}manipulable by commercial and governmental interests. Organizing our discussion by these themes, we offer observations concerning the role of public policy in the protection of privacy in the information age.}, keywords = {confidentiality, privacy}, doi = {10.1126/science.aaa1465}, url = {http://www.sciencemag.org/content/347/6221/509}, author = {Alessandro Acquisti and Laura Brandimarte and George Loewenstein} } @mastersthesis {Shrivastava2015, title = {Probabilistic Hashing Techniques For Big Data}, volume = {Ph.D. }, year = {2015}, school = {Cornell University}, type = {Dissertation}, abstract = {We investigate probabilistic hashing techniques for addressing computational and memory challenges in large scale machine learning and data mining systems. In this thesis, we show that the traditional idea of hashing goes far beyond near-neighbor search and there are some striking new possibilities. We show that hashing can improve state-of-the-art large-scale learning algorithms, and it goes beyond the conventional notions of pairwise similarities. Despite being a very well-studied topic in the literature, we found several opportunities for fundamentally improving some of the well-known textbook hashing algorithms. In particular, we show that the traditional way of computing minwise hashes is unnecessarily expensive and without losing anything we can achieve an order of magnitude speedup. We also found that for cosine similarity search there is a better scheme than SimHash. In the end, we show that the existing locality sensitive hashing framework itself is very restrictive, and we cannot have efficient algorithms for some important measures like inner products which are ubiquitous in machine learning. We propose asymmetric locality sensitive hashing (ALSH), an extended framework, where we show provably and practically efficient algorithms for Maximum Inner Product Search (MIPS). Having such efficient solutions to MIPS directly scales up many popular machine learning algorithms. We believe that this thesis provides significant improvements to some of the heavily used subroutines in big-data systems, which we hope will be adopted.}, url = {https://ecommons.cornell.edu/handle/1813/40886}, author = {Anshumali Shrivastava} } @mastersthesis {2268, title = {Ranking Firms Using Revealed Preference and Other Essays About Labor Markets}, year = {2015}, school = {University of Michigan}, type = {Ph.D.}, address = {Ann Arbor, MI}, abstract = {This dissertation contains essays on three questions about the labor market. Chapter 1 considers the question: why do some firms pay so much and some so little? Firms account for a substantial portion of earnings inequality. Although the standard explanation is that there are search frictions that support an equilibrium with rents, this chapter finds that compensating differentials for nonpecuniary characteristics are at least as important. To reach this finding, this chapter develops a structural search model and estimates it on U.S. administrative data. 
The model analyzes the revealed preference information in the labor market: specifically, how workers move between the 1.5 million firms in the data. With on the order of 1.5 million parameters, standard estimation approaches are infeasible and so the chapter develops a new estimation approach that is feasible on such big data. Chapter 2 considers the question: why do men and women work at different firms? Men work for higher-paying firms than women. The chapter builds on chapter 1 to consider two explanations for why men and women work in different firms. First, men and women might search from different offer distributions. Second, men and women might have different rankings of firms. Estimation finds that the main explanation for why men and women are sorted is that women search from a lower-paying offer distribution than men. Indeed, men and women are estimated to have quite similar rankings of firms. Chapter 3 considers the question: what are there long-run effects of the minimum wage? An empirical consensus suggests that there are small employment effects of minimum wage increases. This chapter argues that these are short-run elasticities. Long-run elasticities, which may differ from short-run elasticities, are more policy relevant. This chapter develops a dynamic industry equilibrium model of labor demand. The model makes two points. First, long-run regressions have been misinterpreted because even if the short- and long-run employment elasticities differ, standard methods would not detect a difference using U.S. variation. Second, the model offers a reconciliation of the small estimated short-run employment effects with the commonly found pass-through of minimum wage increases to product prices.}, keywords = {economics, labor markets}, url = {http://hdl.handle.net/2027.42/116747}, author = {Isaac Sorkin} } @article {1787, title = {Record Linkage using STATA: Pre-processing, Linking and Reviewing Utilities}, journal = {The Stata Journal}, volume = {15}, year = {2015}, pages = {1-15}, abstract = {In this article, we describe Stata utilities that facilitate probabilistic record linkage{\textemdash}the technique typically used for merging two datasets with no common record identifier. While the preprocessing tools are developed specifically for linking two company databases, the other tools can be used for many different types of linkage. Specifically, the stnd_compname and stnd_address commands parse and standardize company names and addresses to improve the match quality when linking. The reclink2 command is a generalized version of Blasnik{\textquoteright}s reclink (2010, Statistical Software Components S456876, Department of Economics, Boston College) that allows for many-to-one matching. Finally, clrevmatch is an interactive tool that allows the user to review matched results in an efficient and seamless manner. Rather than exporting results to another file format (for example, Excel), inputting clerical reviews, and importing back into Stata, one can use the clrevmatch tool to conduct all of these steps within Stata. 
This helps improve the speed and flexibility of matching, which often involves multiple runs.}, url = {http://www.stata-journal.com/article.html?article=dm0082}, author = {Wasi, Nada and Flaaen, Aaron} } @conference {2117, title = {Recording What the Respondent Says: Does Question Format Matter?}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Smyth, J.D. and Olson, K.} } @article {spielman2015plos, title = {Reducing the Margins of Error in the American Community Survey Through Data-Driven Regionalization}, journal = {PlosOne}, year = {2015}, month = {02/2015}, doi = {10.1371/journal.pone.0115626}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115626}, author = {Folch, D. and Spielman, S. E.} } @article {2015arXiv:1502.01974, title = {Regionalization of Multiscale Spatial Processes using a Criterion for Spatial Aggregation Error}, journal = {ArXiv}, year = {2015}, abstract = {The modifiable areal unit problem and the ecological fallacy are known problems that occur when modeling multiscale spatial processes. We investigate how these forms of spatial aggregation error can guide a regionalization over a spatial domain of interest. By "regionalization" we mean a specification of geographies that define the spatial support for areal data. This topic has been studied vigorously by geographers, but has been given less attention by spatial statisticians. Thus, we propose a criterion for spatial aggregation error (CAGE), which we minimize to obtain an optimal regionalization. To define CAGE we draw a connection between spatial aggregation error and a new multiscale representation of the Karhunen-Loeve (K-L) expansion. This relationship between CAGE and the multiscale K-L expansion leads to illuminating theoretical developments including: connections between spatial aggregation error, squared prediction error, spatial variance, and a novel extension of Obled-Creutin eigenfunctions. The effectiveness of our approach is demonstrated through an analysis of two datasets, one using the American Community Survey and one related to environmental ocean winds.}, url = {http://arxiv.org/abs/1502.01974}, author = {Bradley, J.~R. and Wikle, C.K. and Holan, S.~H.} } @article {2084, title = {Rejoinder on: Comparing and selecting spatial predictors using local criteria}, journal = {Test}, volume = {24}, year = {2015}, month = {03/2015}, pages = {54-60}, issn = {1133-0686}, doi = {10.1007/s11749-014-0414-2}, url = {http://dx.doi.org/10.1007/s11749-014-0414-2}, author = {Bradley, J.R. and Cressie, N. and Shi, T.} } @mastersthesis {2197, title = {Relaxations of differential privacy and risk utility evaluations of synthetic data and fidelity measures}, volume = {PhD}, year = {2015}, school = {Duke University }, abstract = {Many organizations collect data that would be useful to public researchers, but cannot be shared due to promises of confidentiality to those that participated in the study. This thesis evaluates the risks and utility of several existing release methods, as well as develops new ones with different risk/utility tradeoffs. 
In Chapter 2, I present a new risk metric, called model-specific probabilistic differential privacy (MPDP), which is a relaxed version of differential privacy that allows the risk of a release to be based on the worst-case among plausible datasets instead of all possible datasets. In addition, I develop a generic algorithm called local sensitivity random sampling (LSRS) that, under certain assumptions, is guaranteed to give releases that meet MPDP for any query with computable local sensitivity. I demonstrate, using several well-known queries, that LSRS releases have much higher utility than the standard differentially private release mechanism, the Laplace Mechanism, at only marginally higher risk. In Chapter 3, using synthesis models, I empirically characterize the risks of releasing synthetic data under the standard {\textquotedblleft}all but one{\textquotedblright} assumption on intruder background knowledge, as well as the effect that decreasing the number of observations the intruder knows beforehand has on that risk. I find in these examples that even in the {\textquotedblleft}all but one{\textquotedblright} case, there is no risk except to extreme outliers, and even then the risk is mild. I find that the effect that removing observations from an intruder{\textquoteright}s background knowledge has on risk heavily depends on how well that intruder can fill in those missing observations: the risk remains fairly constant if he/she can fill them in well, and the risk drops quickly if he/she cannot. In Chapter 4, I characterize the risk/utility tradeoffs for an augmentation of synthetic data called fidelity measures (see Section 1.2.3). Fidelity measures were proposed in Reiter et al. (2009) to quantify the degree to which the results of an analysis performed on a released synthetic dataset match with the results of the same analysis performed on the confidential data. I compare the risk/utility of two different fidelity measures, the confidence interval overlap (Karr et al., 2006) and a new fidelity measure I call the mean predicted probability difference (MPPD). Simultaneously, I compare the risk/utility tradeoffs of two different private release mechanisms, LSRS and a heuristic release method called {\textquotedblleft}safety zones{\textquotedblright}. I find that the confidence interval overlap can be applied to a wider variety of analyses and is more specific than MPPD, but MPPD is more robust to the influence of individual observations in the confidential data, which means it can be released with less noise than the confidence interval overlap with the same level of risk. I also find that while safety zones are much simpler to compute and generally have good utility (whereas the utility of LSRS depends on the value of ε), they are also much more vulnerable to context-specific attacks that, while not easy for an intruder to implement, are difficult to anticipate.}, url = {http://hdl.handle.net/10161/11365}, author = {McClure, D.} } @conference {2111, title = {The Role of Device Type and Respondent Characteristics in Internet Panel Survey Breakoff}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Allan L. 
McCutcheon} } @article {2085, title = {The SAR model for very large datasets: A reduced-rank approach}, journal = {Econometrics}, volume = {3}, year = {2015}, pages = {317-338}, issn = {2225-1146}, doi = {10.3390/econometrics3020317}, url = {http://www.mdpi.com/2225-1146/3/2/317}, author = {Burden, S. and Cressie, N. and Steel, D.G.} } @article {1575, title = {Semi-parametric selection models for potentially non-ignorable attrition in panel studies with refreshment samples}, journal = {Political Analysis}, volume = {23}, year = {2015}, pages = {92-112}, chapter = {92}, url = {http://pan.oxfordjournals.org/cgi/reprint/mpu009?\%20ijkey=joX8eSl6gyIlQKP\&keytype=ref}, author = {Y. Si and J.P. Reiter and D.S. Hillygus} } @article {1931, title = {Simultaneous Edit-Imputation for Continuous Microdata}, journal = {Journal of the American Statistical Association}, volume = {110}, year = {2015}, pages = {987-999}, doi = {10.1080/01621459.2015.1040881}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2015.1040881}, author = {Kim, H. J. and Cox, L. H. and Karr, A. F. and Reiter, J. P. and Wang, Q.} } @article {1742, title = {Small Area Estimation via Multivariate Fay-Herriot Models With Latent Spatial Dependence}, journal = {Australian \& New Zealand Journal of Statistics}, volume = {57}, year = {2015}, pages = {15-29}, url = {http://arxiv.org/abs/1310.7211}, author = {Porter, A.T. and Wikle, C.K. and Holan, S.H.} } @article {STA4:STA494, title = {Spatio-temporal change of support with application to American Community Survey multi-year period estimates}, journal = {Stat}, volume = {4}, year = {2015}, month = {10/2015}, pages = {255{\textendash}270}, abstract = {We present hierarchical Bayesian methodology to perform spatio-temporal change of support (COS) for survey data with Gaussian sampling errors. This methodology is motivated by the American Community Survey (ACS), which is an ongoing survey administered by the US Census Bureau that provides timely information on several key demographic variables. The ACS has published 1-year, 3-year, and 5-year period estimates, and margins of errors, for demographic and socio-economic variables recorded over predefined geographies. The spatio-temporal COS methodology considered here provides data users with a way to estimate ACS variables on customized geographies and time periods while accounting for sampling errors. Additionally, 3-year ACS period estimates are to be discontinued, and this methodology can provide predictions of ACS variables for 3-year periods given the available period estimates. The methodology is based on a spatio-temporal mixed-effects model with a low-dimensional spatio-temporal basis function representation, which provides multi-resolution estimates through basis function aggregation in space and time. This methodology includes a novel parameterization that uses a target dynamical process and recently proposed parsimonious Moran{\textquoteright}s I propagator structures. Our approach is demonstrated through two applications using public-use ACS estimates and is shown to produce good predictions on a hold-out set of 3-year period estimates. Copyright {\textcopyright} 2015 John Wiley \& Sons, Ltd.}, keywords = {Bayesian, change-of-support, dynamical, hierarchical models, mixed-effects model, Moran{\textquoteright}s I, multi-year period estimate}, issn = {2049-1573}, doi = {10.1002/sta4.94}, url = {http://dx.doi.org/10.1002/sta4.94}, author = {Bradley, Jonathan R. and Wikle, Christopher K. 
and Holan, Scott H.} } @article {1596, title = {Statistical Disclosure Limitation in the Presence of Edit Rules}, journal = {Journal of Official Statistics}, volume = {31}, year = {2015}, pages = {121-138}, chapter = {121}, author = {Kim, H.J. and Karr, A.F. and Reiter, J.P.} } @article {2091, title = {A stochastic bioenergetics model based approach to translating large river flow and temperature in to fish population responses: the pallid sturgeon example}, journal = {Geological Society}, volume = {408}, year = {2015}, issn = {2041-4927}, doi = {10.1144/SP408.10}, author = {Wildhaber, M.L. and Dey, R. and Wikle, C.K. and Anderson, C.J. and Moran, E.H. and Franz, K.J.} } @article {2185, title = {Stop or continue data collection: A nonignorable missing data approach for continuous variables}, journal = {ArXiv}, year = {2015}, month = {11/2015}, abstract = {We present an approach to inform decisions about nonresponse followup sampling. The basic idea is (i) to create completed samples by imputing nonrespondents{\textquoteright} data under various assumptions about the nonresponse mechanisms, (ii) take hypothetical samples of varying sizes from the completed samples, and (iii) compute and compare measures of accuracy and cost for different proposed sample sizes. As part of the methodology, we present a new approach for generating imputations for multivariate continuous data with nonignorable unit nonresponse. We fit mixtures of multivariate normal distributions to the respondents{\textquoteright} data, and adjust the probabilities of the mixture components to generate nonrespondents{\textquoteright} distributions with desired features. We illustrate the approaches using data from the 2007 U. S. Census of Manufactures. }, keywords = {Methodology}, url = {http://arxiv.org/abs/1511.02189}, author = {T. Paiva and J.P. Reiter} } @article {doi:10.1080/00045608.2015.1052335, title = {Studying Neighborhoods Using Uncertain Data from the American Community Survey: A Contextual Approach}, journal = {Annals of the Association of American Geographers}, volume = {105}, number = {5}, year = {2015}, pages = {1003-1025}, abstract = {In 2010 the American Community Survey (ACS) replaced the long form of the decennial census as the sole national source of demographic and economic data for small geographic areas such as census tracts. These small area estimates suffer from large margins of error, however, which makes the data difficult to use for many purposes. The value of a large and comprehensive survey like the ACS is that it provides a richly detailed, multivariate, composite picture of small areas. This article argues that one solution to the problem of large margins of error in the ACS is to shift from a variable-based mode of inquiry to one that emphasizes a composite multivariate picture of census tracts. Because the margin of error in a single ACS estimate, like household income, is assumed to be a symmetrically distributed random variable, positive and negative errors are equally likely. Because the variable-specific estimates are largely independent from each other, when looking at a large collection of variables these random errors average to zero. This means that although single variables can be methodologically problematic at the census tract scale, a large collection of such variables provides utility as a contextual descriptor of the place(s) under investigation. This idea is demonstrated by developing a geodemographic typology of all U.S. census tracts. 
The typology is firmly rooted in the social scientific literature and is organized around a framework of concepts, domains, and measures. The typology is validated using public domain data from the City of Chicago and the U.S. Federal Election Commission. The typology, as well as the data and methods used to create it, is open source and published freely online. }, doi = {10.1080/00045608.2015.1052335}, url = {http://dx.doi.org/10.1080/00045608.2015.1052335}, author = {Seth E. Spielman and Alex Singleton} } @conference {2110, title = {Survey Informatics: The Future of Survey Methodology and Survey Statistics Training in the Academy?}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Allan L. McCutcheon} } @techreport {handle:1813:42340, title = {Synthetic Establishment Microdata Around the World}, number = {1813:42340}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {Synthetic Establishment Microdata Around the World Vilhuber, Lars; Abowd, John A.; Reiter, Jerome P. In contrast to the many public-use microdata samples available for individual and household data from many statistical agencies around the world, there are virtually no establishment or firm microdata available. In large part, this difficulty in providing access to business micro data is due to the skewed and sparse distributions that characterize business data. Synthetic data are simulated data generated from statistical models. We organized sessions at the 2015 World Statistical Congress and the 2015 Joint Statistical Meetings, highlighting work on synthetic establishment microdata. This overview situates those papers, published in this issue, within the broader literature.}, url = {http://hdl.handle.net/1813/42340}, author = {Vilhuber, Lars and Abowd, John A. and Reiter, Jerome P.} } @article {2419, title = {Understanding the Dynamics of $2-a-Day Poverty in the United States}, journal = {The Russell Sage Foundation Journal of the Social Sciences}, volume = {1}, year = {2015}, author = {Shaefer, H. Luke and Edin, Kathryn and Talbert, E.} } @article {2206, title = {Understanding the Human Condition through Survey Informatics}, journal = {IEEE Computer}, volume = {48}, year = {2015}, pages = {112-116}, issn = {0018-9162}, doi = {10.1109/MC.2015.327}, author = {Eck, A. and Leen-Kiat, S. and McCutcheon, A. L. and Smyth, J.D. and Belli, R.F.} } @conference {2106, title = {The Use of Paradata to Evaluate Interview Complexity and Data Quality (in Calendar and Time Diary Surveys)}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Cordova-Cazar, A.L. and Belli, R.F.} } @conference {2105, title = {Using Data Mining to Examine Interviewer-Respondent Interactions in Calendar Interviews}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Belli, R.F. and Miller, L.D. and Soh, L.-K. and T. 
Al Baghal} } @conference {2104, title = {Using Machine Learning Techniques to Predict Respondent Type from A Priori Demographic Information}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {May 14-17, 2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Wettlaufer, D. and Soh, L.-K. and Belli, R.F.} } @techreport {handle:1813:42339, title = {Using Partially Synthetic Microdata to Protect Sensitive Cells in Business Statistics}, number = {1813:42339}, year = {2015}, institution = {Cornell University}, type = {Preprint}, abstract = {Using Partially Synthetic Microdata to Protect Sensitive Cells in Business Statistics Vilhuber, Lars; Miranda, Javier We describe and analyze a method that blends records from both observed and synthetic microdata into public-use tabulations on establishment statistics. The resulting tables use synthetic data only in potentially sensitive cells. We describe different algorithms, and present preliminary results when applied to the Census Bureau{\textquoteright}s Business Dynamics Statistics and Synthetic Longitudinal Business Database, highlighting accuracy and protection afforded by the method when compared to existing public-use tabulations (with suppressions).}, url = {http://hdl.handle.net/1813/42339}, author = {Vilhuber, Lars and Miranda, Javier} } @conference {2113, title = {Web Surveys, Online Panels, and Paradata: Automating Responsive Design}, booktitle = {2015 Joint Program in Survey Methodology (JPSM) Distinguished Lecture}, year = {2015}, month = {04/2015}, address = {University of Maryland. College Park, MD}, url = {http://www.jpsm.umd.edu/}, author = {Allan L. McCutcheon} } @article {1819, title = {Who{\textquoteright}s Left Out? Characteristics of Households in Economic Need not Receiving Public Support}, journal = {Journal of Sociology and Social Welfare}, volume = {42}, year = {2015}, pages = {65-85}, author = {Fusaro, V.} } @conference {2114, title = {Why Do Interviewers Speed Up? An Examination of Changes in Interviewer Behaviors over the Course of the Survey Field Period}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Olson, K. and Smyth, J.D.} } @conference {2162, title = {Achieving balance: Understanding the relationship between complexity and response quality}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Powell, R.J. and Kirchner, A.} } @article {Wikle2014, title = {Agent Based Models: Statistical Challenges and Opportunities}, journal = {Statistics Views}, year = {2014}, publisher = {Wiley}, url = {http://www.statisticsviews.com/details/feature/6354691/Agent-Based-Models-Statistical-Challenges-and-Opportunities.html}, author = {Wikle, C.K.} } @inbook {1573, title = {Analytical frameworks for data release: A statistical view}, booktitle = {Confidentiality and Data Access in the Use of Big Data: Theory and Practical Approaches}, year = {2014}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, address = {New York City, NY}, author = {A. F. Karr and J. P. 
Reiter} } @booklet {Holan2014b, title = {An Approach for Identifying and Predicting Economic Recessions in Real-Time Using Time-Frequency Functional Models, Seminar on Bayesian Inference in Econometrics and Statistics (SBIES)}, year = {2014}, month = {May}, author = {Holan, S.H.} } @conference {Holan2014, title = {An Approach for Identifying and Predicting Economic Recessions in Real-Time Using Time-Frequency Functional Models}, booktitle = {Joint Statistical Meetings 2014}, year = {2014}, month = {August}, publisher = {Joint Statistical Meetings}, organization = {Joint Statistical Meetings}, address = {Boston, MA}, doi = {10.1002/asmb.1954}, url = {http://www.amstat.org/meetings/jsm/2014/onlineprogram/AbstractDetails.cfm?abstractid=310841}, author = {Holan, S.H.} } @article {McElroy2014, title = {Asymptotic Theory of Cepstral Random Fields}, journal = {Annals of Statistics}, volume = {42}, year = {2014}, pages = {64-86}, publisher = {University of Missouri}, doi = {10.1214/13-AOS1180}, url = {http://arxiv.org/pdf/1112.1977v4.pdf}, author = {McElroy, T. and Holan, S.} } @inbook {Belli2014, title = {Autobiographical memory dynamics in survey research}, booktitle = {SAGE Handbook of Applied Memory}, year = {2014}, publisher = {Sage}, organization = {Sage}, doi = {10.4135/9781446294703}, url = {http://dx.doi.org/10.4135/9781446294703}, author = {Belli, R. F.}, editor = {T. J. Perfect and D. S. Lindsay} } @booklet {Holan2014d, title = {A Bayesian Approach to Estimating Agricultural Yield Based on Multiple Repeated Surveys}, year = {2014}, month = {March}, author = {Holan, S.H.} } @conference {Holan2014a, title = {Bayesian Dynamic Time-Frequency Estimation}, booktitle = {Twelfth World Meeting of ISBA}, year = {2014}, month = {July}, publisher = {ISBA}, organization = {ISBA}, address = {Cancun, Mexico}, author = {Holan, S.H.} } @article {1527, title = {Bayesian estimation of disclosure risks for multiply imputed, synthetic data}, journal = {Journal of Privacy and Confidentiality}, volume = {6}, year = {2014}, month = {2014}, abstract = {

Agencies seeking to disseminate public use microdata, i.e., data on individual records, can replace confidential values with multiple draws from statistical models estimated with the collected data. We present a framework for evaluating disclosure risks inherent in releasing multiply-imputed, synthetic data. The basic idea is to mimic an intruder who computes posterior distributions of confidential values given the released synthetic data and prior knowledge. We illustrate the methodology with artificial fully synthetic data and with partial synthesis of the Survey of Youth in Custody.

}, url = {http://repository.cmu.edu/jpc/vol6/iss1/2}, author = {Reiter, J. P. and Wang, Q. and Zhang, B.} } @article {ManriqueReiter2013, title = {Bayesian estimation of discrete multivariate latent structure models with structural zeros}, journal = {Journal of Computational and Graphical Statistics}, volume = {23}, year = {2014}, pages = {1061-1079}, author = {Manrique-Vallier, D. and Reiter, J.P.} } @article {Manrique-Vallierforthcoming, title = {Bayesian multiple imputation for large-scale categorical data with structural zeros}, journal = {Survey Methodology}, volume = {40}, year = {2014}, month = {06/2014}, pages = {125-134}, url = {http://www.stat.duke.edu/~jerry/Papers/SurvMeth14.pdf}, author = {D. Manrique-Vallier and J.P. Reiter} } @techreport {2014arXiv1408.1027D, title = {{Bayesian Nonparametric Modeling for Multivariate Ordinal Regression}}, number = {1408.1027}, year = {2014}, institution = {ArXiv}, abstract = {Univariate or multivariate ordinal responses are often assumed to arise from a latent continuous parametric distribution, with covariate effects which enter linearly. We introduce a Bayesian nonparametric modeling approach for univariate and multivariate ordinal regression, which is based on mixture modeling for the joint distribution of latent responses and covariates. The modeling framework enables highly flexible inference for ordinal regression relationships, avoiding assumptions of linearity or additivity in the covariate effects. In standard parametric ordinal regression models, computational challenges arise from identifiability constraints and estimation of parameters requiring nonstandard inferential techniques. A key feature of the nonparametric model is that it achieves inferential flexibility, while avoiding these difficulties. In particular, we establish full support of the nonparametric mixture model under fixed cut-off points that relate through discretization the latent continuous responses with the ordinal responses. The practical utility of the modeling approach is illustrated through application to two data sets from econometrics, an example involving regression relationships for ozone concentration, and a multirater agreement problem.}, keywords = {Statistics - Methodology}, url = {http://arxiv.org/abs/1408.1027}, author = {DeYoreo, M. and Kottas, A.} } @booklet {Porter2014c, title = {Big Data Methodology Applied to Small Area Estimation}, year = {2014}, month = {January}, author = {Porter, A.T.} } @conference {2145, title = {Call back later: The association of recruitment contact and error in the American Time Use Survey}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Countryman, A. and Cordova-Cazar, A.L. and Deal, C.E. and Belli, R.F.} } @article {1631, title = {A CAR model for multiple outcomes on mismatched lattices}, journal = {Spatial and Spatio-Temporal Epidemiology}, volume = {11}, year = {2014}, pages = {79-88}, chapter = {79}, doi = {10.1016/j.sste.2014.08.001}, url = {http://www.sciencedirect.com/science/article/pii/S1877584514000604}, author = {Porter, A.T. 
and Oleson, J.} } @article {spielman2014causes, title = {Causes and Patterns of Uncertainty in the American Community Survey}, journal = {Applied Geography}, volume = {46}, year = {2014}, pages = {147-157}, doi = {10.1016/j.apgeog.2013.11.002}, url = {http://www.sciencedirect.com/science/article/pii/S0143622813002518}, author = {Spielman, S. E. and Folch, D. and Nagle, N.} } @techreport {handle:1813:44702, title = {CED2AR: The Comprehensive Extensible Data Documentation and Access Repository}, number = {1813:44702}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {CED2AR: The Comprehensive Extensible Data Documentation and Access Repository Lagoze, Carl; Vilhuber, Lars; Williams, Jeremy; Perry, Benjamin; Block, William C. We describe the design, implementation, and deployment of the Comprehensive Extensible Data Documentation and Access Repository (CED2AR). This is a metadata repository system that allows researchers to search, browse, access, and cite confidential data and metadata through either a web-based user interface or programmatically through a search API, all the while re-using and linking to existing archive and provider generated metadata. CED2AR is distinguished from other metadata repository-based applications due to requirements that derive from its social science context. These include the need to cloak confidential data and metadata and manage complex provenance chains. Presented at the 2014 IEEE/ACM Joint Conference on Digital Libraries (JCDL), Sept 8-12, 2014}, url = {http://hdl.handle.net/1813/44702}, author = {Lagoze, Carl and Vilhuber, Lars and Williams, Jeremy and Perry, Benjamin and Block, William C.} } @techreport {HolanMcElroyWu2014, title = {The Cepstral Model for Multivariate Time Series: The Vector Exponential Model.}, number = {1406.0801}, year = {2014}, institution = {arXiv}, type = {preprint}, abstract = {

Vector autoregressive (VAR) models have become a staple in the analysis of multivariate time series and are formulated in the time domain as difference equations, with an implied covariance structure. In many contexts, it is desirable to work with a stable, or at least stationary, representation. To fit such models, one must impose restrictions on the coefficient matrices to ensure that certain determinants are nonzero, which, except in special cases, may prove burdensome. To circumvent these difficulties, we propose a flexible frequency domain model expressed in terms of the spectral density matrix. Specifically, this paper treats the modeling of covariance stationary vector-valued (i.e., multivariate) time series via an extension of the exponential model for the spectrum of a scalar time series. We discuss the modeling advantages of the vector exponential model and its computational facets, such as how to obtain Wold coefficients from given cepstral coefficients. Finally, we demonstrate the utility of our approach through simulation as well as two illustrative data examples focusing on multi-step ahead forecasting and estimation of squared coherence.

}, url = {http://arxiv.org/abs/1406.0801}, author = {Holan, S.H. and McElroy, T.S. and Wu, G.} } @conference {2158, title = {Changes in interviewer-related error over the course of the field period: An empirical examination using paradata}, booktitle = {Joint Statistical Meetings}, year = {2014}, address = {Boston, MA}, author = {Olson, K. and Kirchner, A.} } @conference {2157, title = {Changes in interviewer-related error over the course of the field period: An empirical examination using paradata}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Olson, K. and Kirchner, A.} } @article {spielman2014coevolution, title = {The Co-Evolution of Residential Segregation and the Built Environment at the Turn of the 20th Century: a Schelling Model}, journal = {Transactions in GIS}, volume = {18}, number = {1}, year = {2014}, pages = {25-45}, doi = {DOI: 10.1111/tgis.12014}, url = {http://onlinelibrary.wiley.com/enhanced/doi/10.1111/tgis.12014/}, author = {Spielman, S. E. and Harrison, P.} } @techreport {handle:1813:38200, title = {Collaborative Editing of DDI Metadata: The Latest from the CED2AR Project}, number = {1813:38200}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {Collaborative Editing of DDI Metadata: The Latest from the CED2AR Project Perry, Benjamin; Kambhampaty, Venkata; Brumsted, Kyle; Vilhuber, Lars; Block, William Benjamin Perry{\textquoteright}s presentation on \"Collaborative Editing and Versioning of DDI Metadata: The Latest from Cornell{\textquoteright}s NCRN CED{\texttwosuperior}AR Software\" at the 6th Annual European DDI User Conference in London, 12/02/2014.}, url = {http://hdl.handle.net/1813/38200}, author = {Perry, Benjamin and Kambhampaty, Venkata and Brumsted, Kyle and Vilhuber, Lars and Block, William} } @conference {2150, title = {Commitment, concealment, and confusion: An empirical assessment of interviewer and respondent behaviors in survey interviews}, booktitle = {39th Annual Conference of the Midwest Association for Public Opinion Research}, year = {2014}, month = {11/2014}, address = {Chicago, IL}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Kirchner, A. and Olson, K.} } @techreport {handle:1813:36323, title = {Communicating Uncertainty in Official Economic Statistics}, number = {1813:36323}, year = {2014}, institution = {Northwestern University}, type = {Preprint}, abstract = {Communicating Uncertainty in Official Economic Statistics Manski, Charles Federal statistical agencies in the United States and analogous agencies elsewhere commonly report official economic statistics as point estimates, without accompanying measures of error. Users of the statistics may incorrectly view them as error-free or may incorrectly conjecture error magnitudes. This paper discusses strategies to mitigate misinterpretation of official statistics by communicating uncertainty to the public. Sampling error can be measured using established statistical principles. The challenge is to satisfactorily measure the various forms of nonsampling error. I find it useful to distinguish transitory statistical uncertainty, permanent statistical uncertainty, and conceptual uncertainty. 
I illustrate how each arises as the Bureau of Economic Analysis periodically revises GDP estimates, the Census Bureau generates household income statistics from surveys with nonresponse, and the Bureau of Labor Statistics seasonally adjusts employment statistics.}, url = {http://hdl.handle.net/1813/36323}, author = {Manski, Charles} } @techreport {handle:1813:40830, title = {Communicating Uncertainty in Official Economic Statistics: An Appraisal Fifty Years after Morgenstern}, number = {1813:40830}, year = {2014}, month = {10/2014}, institution = {Northwestern University}, type = {Preprint}, abstract = {

Communicating Uncertainty in Official Economic Statistics: An Appraisal Fifty Years after Morgenstern Manski, Charles F. Federal statistical agencies in the United States and analogous agencies elsewhere commonly report official economic statistics as point estimates, without accompanying measures of error. Users of the statistics may incorrectly view them as error-free or may incorrectly conjecture error magnitudes. This paper discusses strategies to mitigate misinterpretation of official statistics by communicating uncertainty to the public. Sampling error can be measured using established statistical principles. The challenge is to satisfactorily measure the various forms of nonsampling error. I find it useful to distinguish transitory statistical uncertainty, permanent statistical uncertainty, and conceptual uncertainty. I illustrate how each arises as the Bureau of Economic Analysis periodically revises GDP estimates, the Census Bureau generates household income statistics from surveys with nonresponse, and the Bureau of Labor Statistics seasonally adjusts employment statistics. I anchor my discussion of communication of uncertainty in the contribution of Morgenstern (1963), who argued forcefully for agency publication of error estimates for official economic statistics.

}, url = {http://hdl.handle.net/1813/40830}, author = {Manski, Charles F.} } @mastersthesis {moehl2014comparing, title = {Comparing models of Demographic Subpopulations (Master{\textquoteright}s Thesis)}, year = {2014}, school = {University of Tennessee}, type = {masters}, url = {http://trace.tennessee.edu/utk_gradthes/2835/; http://trace.tennessee.edu/cgi/viewcontent.cgi?article=4005\&context=utk_gradthes}, author = {Moehl, J.} } @inbook {ste:ven:sad:2014, title = {A Comparison of Blocking Methods for Record Linkage}, booktitle = {Privacy in Statistical Databases}, volume = {8744}, year = {2014}, pages = {253{\textendash}268}, publisher = {Springer}, organization = {Springer}, doi = {10.1007/978-3-319-11257-2_20}, url = {http://link.springer.com/chapter/10.1007/978-3-319-11257-2_20}, author = {Steorts, R. and Ventura, S. and Sadinle, M. and Fienberg, S. E. and Domingo-Ferrer, J.} } @article {1743, title = {A Comparison of Spatial Predictors when Datasets Could be Very Large}, journal = {ArXiv}, year = {2014}, abstract = {

In this article, we review and compare a number of methods of spatial prediction. To demonstrate the breadth of available choices, we consider both traditional and more-recently-introduced spatial predictors. Specifically, in our exposition we review: traditional stationary kriging, smoothing splines, negative-exponential distance-weighting, Fixed Rank Kriging, modified predictive processes, a stochastic partial differential equation approach, and lattice kriging. This comparison is meant to provide a service to practitioners wishing to decide between spatial predictors. Hence, we provide technical material for the unfamiliar, which includes the definition and motivation for each (deterministic and stochastic) spatial predictor. We use a benchmark dataset of CO2 data from NASA{\textquoteright}s AIRS instrument to address computational efficiencies that include CPU time and memory usage. Furthermore, the predictive performance of each spatial predictor is assessed empirically using a hold-out subset of the AIRS data.

}, keywords = {Statistics - Methodology}, url = {http://arxiv.org/abs/1410.7748}, author = {Bradley, J.~R. and Cressie, N. and Shi, T.} } @article {nagle2014dasymetric, title = {Dasymetric Modeling and Uncertainty}, journal = {The Annals of the Association of American Geographers}, volume = {104}, number = {1}, year = {2014}, pages = {80-95}, doi = {DOI: 10.1080/00045608.2013.843439}, url = {http://www.tandfonline.com/doi/abs/10.1080/00045608.2013.843439}, author = {Nagle, N. and Buttenfield, B. and Leyk, S. and Spielman, S. E.} } @mastersthesis {rose2014diss, title = {Data Fusion Methods for Improved Demographic Resolution of Population Distribution Datasets (Ph.D. Thesis)}, year = {2014}, school = {University of Tennessee}, type = {phd}, author = {Rose, A.} } @conference {2168, title = {Data Quality among Devices to Complete Surveys: Comparing Personal Computers, Smartphones and Tablets}, booktitle = {Midwest Association for Public Opinion Research Annual Meeting}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Wang, Mengyang and McCutcheon, Allan L.} } @article {2416, title = {Deprivation Among U.S. Children With Disabilities Who Receive Supplemental Security Income}, journal = {Journal of Disability Policy Studies}, year = {2014}, doi = {10.1177/1044207314539011}, author = {Ghosth, S. and Parish, S. L.} } @conference {2136, title = {Designing an Intelligent Time Diary Instrument: Visualization, Dynamic Feedback, and Error Prevention and Mitigation}, booktitle = {UNL/SRAM/Gallup Symposium}, year = {2014}, address = {Omaha, NE}, url = {http://grc.unl.edu/unlsramgallup-symposium}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Soh, L.-K. and Belli, R.F.} } @conference {2135, title = {Designing an Intelligent Time Diary Instrument: Visualization, Dynamic Feedback, and Error Prevention and Mitigation}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA. }, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Soh, L.-K. and Belli, R.} } @article {sad:2014, title = {Detecting Duplicates in a Homicide Registry Using a Bayesian Partitioning Approach}, journal = {Annals of Applied Statistics}, volume = {8}, number = {4}, year = {2014}, pages = {2404{\textendash}2434}, author = {Sadinle, M.} } @inbook {1576, title = {Disclosure risk evaluation for fully synthetic data}, booktitle = {Privacy in Statistical Databases}, volume = {8744}, year = {2014}, pages = {185-199}, publisher = {Springer}, organization = {Springer}, address = {Heidelberg}, author = {J. Hu and J.P. Reiter and Q. Wang} } @article {acq:tay:2014, title = {The Economics of Privacy}, journal = {Journal of Economic Literature}, year = {2014}, note = {Commissioned article. To appear}, author = {Acquisti, A. and Taylor, C.} } @conference {2159, title = {The Effect of CATI Questionnaire Design Features on Response Timing}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Olson, K. 
and Smyth, Jolene} } @conference {2153, title = {The effects of unfamiliar terms on interviewer and respondent behaviors: Are subsequent questions affected?}, booktitle = {Paper presented at the Midwest Association for Public Opinion Research annual meeting}, year = {2014}, month = {11/2014}, address = {Chicago, IL}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Lee, J. and Olson, K.} } @inbook {1601, title = {Enabling statistical analysis of suppressed tabular data, in Privacy in Statistical Databases}, booktitle = {Lecture Notes in Computer Science}, volume = {8744}, year = {2014}, pages = {1-10}, publisher = {Springer}, organization = {Springer}, address = {Heidelberg}, author = {L. Cox} } @article {2014arXiv1409.0643S, title = {{Entity Resolution with Empirically Motivated Priors}}, journal = {ArXiv }, number = {1409.0643}, year = {2014}, abstract = {Databases often contain corrupted, degraded, and noisy data with duplicate entries across and within each database. Such problems arise in citations, medical databases, genetics, human rights databases, and a variety of other applied settings. The target of statistical inference can be viewed as an unsupervised problem of determining the edges of a bipartite graph that links the observed records to unobserved latent entities. Bayesian approaches provide attractive benefits, naturally providing uncertainty quantification via posterior probabilities. We propose a novel record linkage approach based on empirical Bayesian principles. Specifically, the empirical Bayesian--type step consists of taking the empirical distribution function of the data as the prior for the latent entities. This approach improves on the earlier HB approach not only by avoiding the prior specification problem but also by allowing both categorical and string-valued variables. Our extension to string-valued variables also involves the proposal of a new probabilistic mechanism by which observed record values for string fields can deviate from the values of their associated latent entities. Categorical fields that deviate from their corresponding true value are simply drawn from the empirical distribution function. We apply our proposed methodology to a simulated data set of German names and an Italian household survey, showing our method performs favorably compared to several standard methods in the literature. We also consider the robustness of our methods to changes in the hyper-parameters.}, keywords = {Statistics - Methodology}, url = {http://arxiv.org/abs/1409.0643}, author = {Steorts, R.~C.} } @conference {HolanMcElroy2014, title = {Fast Estimation of Time Series with Multiple Long-Range Persistencies}, booktitle = {ASA Proceedings of the Joint Statistical Meetings}, year = {2014}, publisher = {American Statistical Association}, organization = {American Statistical Association}, address = {Alexandria, VA}, author = {McElroy, T.S. and Holan, S.H.} } @conference {Porter2014, title = {Flexible Bayesian Methodology for Multivariate Spatial Small Area Estimation}, booktitle = {Joint Statistical Meetings 2014}, year = {2014}, month = {August}, address = {Boston, MA}, author = {Porter, A.T.} } @techreport {2054, title = {Flexible prior specification for partially identified nonlinear regression with binary responses}, number = {1407.8430}, year = {2014}, institution = {arXiv}, abstract = {This paper adapts tree-based Bayesian regression models for estimating a partially identified probability function. 
In doing so, ideas from the recent literature on Bayesian partial identification are applied within a sophisticated applied regression context. Our approach permits efficient sensitivity analysis concerning the posterior impact of priors over the partially identified component of the regression model. The new methodology is illustrated on an important problem where we only have partially observed data -- inferring the prevalence of accounting misconduct among publicly traded U.S. businesses.}, url = {https://arxiv.org/abs/1407.8430v1}, author = {P. R. Hahn and J. S. Murray and I. Manolopoulou} } @conference {Quick2014a, title = {A Fully Bayesian Approach for Generating Synthetic Marks and Geographies for Confidential Data}, booktitle = {International Indian Statistical Association}, year = {2014}, month = {July}, publisher = {IISA}, organization = {IISA}, author = {Quick, H.} } @article {1594, title = {The generalized multiset sampler}, journal = {Journal of Computational and Graphical Statistics}, year = {2014}, month = {10/2014}, doi = {10.1080/10618600.2014.962701}, url = { http://dx.doi.org/10.1080/10618600.2014.962701 }, author = {H. Kim and S. N. MacEachern} } @conference {2151, title = {{\textquoteleft}Good Respondent, Bad Respondent{\textquoteright}? Assessing Response Quality in Internet Surveys}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, month = {05/2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Kirchner, A. and Powell, R.} } @article {1800, title = {Harnessing Naturally Occurring Data to Measure the Response of Spending to Income}, journal = {Science}, volume = {345}, year = {2014}, chapter = {212-215}, abstract = {This paper presents a new data infrastructure for measuring economic activity. The infrastructure records transactions and account balances, yielding measurements with scope and accuracy that have little precedent in economics. The data are drawn from a diverse population that overrepresents males and younger adults but contains large numbers of underrepresented groups. The data infrastructure permits evaluation of a benchmark theory in economics that predicts that individuals should use a combination of cash management, saving, and borrowing to make the timing of income irrelevant for the timing of spending. As in previous studies and in contrast to the predictions of the theory, there is a response of spending to the arrival of anticipated income. The data also show, however, that this apparent excess sensitivity of spending results largely from the coincident timing of regular income and regular spending. The remaining excess sensitivity is concentrated among individuals with less liquidity. Link to data at Berkeley Econometrics Lab (EML): https://eml.berkeley.edu/cgi-bin/HarnessingDataScience2014.cgi}, doi = {10.1126/science.1247727}, url = {http://www.sciencemag.org/content/345/6193/212.full}, author = {Gelman, M. and Kariv, S. and Shapiro, M.D. and Silverman, D. and Tadelis, S.} } @conference {2165, title = {Having a Lasting Impact: The Effects of Interviewer Errors on Data Quality}, booktitle = {Midwest Association for Public Opinion Research Annual Conference}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Timm, A. and Olson, K. 
and Smyth, J.D.} } @inbook {ven:nug:fuc:2014, title = {Hierarchical Linkage Clustering with Distributions of Distances for Large Scale Record Linkage}, booktitle = {Privacy in Statistical Databases (Lecture Notes in Computer Science)}, volume = {8744}, year = {2014}, pages = {283{\textendash}298}, publisher = {Springer}, organization = {Springer}, author = {Ventura, S. and Nugent, R. and Fuchs, E.}, editor = {Domingo-Ferrer, J.} } @conference {2142, title = {Hours or Minutes: Does One Unit Fit All?}, booktitle = {Midwest Association for Public Opinion Research Annual Conference}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Cochran, B. and Smyth, J.D.} } @article {spielmannagle2014citylab, title = {How to Make a Better Map{\textemdash}Using Neuroscience}, year = {2014}, publisher = {Citylab}, type = {Online}, abstract = {

The work of Seth Spielman and Nicholas Nagle was noted in this CityLab article (a publication of The Atlantic), available at http://www.citylab.com/design/2014/11/how-to-make-a-better-map-according-to-science/382898/.

}, keywords = {Nicholas Nagle, Seth Spielman}, url = {http://www.citylab.com/design/2014/11/how-to-make-a-better-map-according-to-science/382898/}, author = {Laura Bliss} } @article {pee:acq:sha:2014, title = {I Cheated, but only a Little{\textendash}Partial Confessions to Unethical Behavior}, journal = {Journal of Personality and Social Psychology}, volume = {106}, number = {2}, year = {2014}, pages = {202{\textendash}217}, author = {Peer, E. and Acquisti, A. and Shalvi, S.} } @article {folch2014identifying, title = {Identifying Regions based on Flexible User Defined Constraints}, journal = {International Journal of Geographic Information Science}, volume = {28}, number = {1}, year = {2014}, pages = {164-184}, doi = {10.1080/13658816.2013.848986}, url = {http://www.tandfonline.com/doi/abs/10.1080/13658816.2013.848986}, author = {Folch, D. and Spielman, S. E.} } @article {Paivaforthcoming, title = {Imputation of confidential data sets with spatial locations using disease mapping models}, journal = {Statistics in Medicine}, volume = {33}, year = {2014}, pages = {1928-1945}, author = {T. Paiva and A. Chakraborty and J.P. Reiter and A.E. Gelfand} } @techreport {1578, title = {Interval Estimates for Official Statistics with Survey Nonresponse}, year = {2014}, author = {Manski, C.} } @conference {2137, title = {Interviewer variance and prevalence of verbal behaviors in calendar and conventional interviewing}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Belli, R.F. and Charoenruk, N.,} } @conference {2138, title = {Interviewer variance of interviewer and respondent behaviors: A comparison between calendar and conventional interviewing}, booktitle = {XVIII International Sociological Association World Congress of Sociology}, year = {2014}, address = {Yokohama, Japan}, url = {https://isaconf.confex.com/isaconf/wc2014/webprogram/Paper34278.html}, author = {Belli, R.F. and Charoenruk, N.,} } @article {1598, title = {Longitudinal mixed membership trajectory models for disability survey data}, journal = {Annals of Applied Statistics}, volume = {8}, year = {2014}, pages = {2268-2291}, chapter = {2268}, author = {Manrique-Vallier, D} } @conference {2149, title = {Making sense of paradata: Challenges faced and lessons learned}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Eck, A. and Stuart, L. and Atkin, G. and Soh, L-K and McCutcheon, A.L. and Belli, R.F.} } @conference {2148, title = {Making Sense of Paradata: Challenges Faced and Lessons Learned}, booktitle = {UNL/SRAM/Gallup Symposium}, year = {2014}, address = {Omaha, NE}, url = {http://grc.unl.edu/unlsramgallup-symposium}, author = {Eck, A. and Stuart, L. and Atkin, G. and Soh, L-K and McCutcheon, A.L. and Belli, R.F.} } @article {Fan2013, title = {Multiple imputation by ordered monotone blocks with application to the Anthrax Vaccine Adsorbed Trial}, journal = {Journal of Computational and Graphical Statistics}, volume = {23}, number = {ja}, year = {2014}, pages = {877-892}, doi = {10.1080/10618600.2013.826583}, url = {http://www.tandfonline.com/doi/abs/10.1080/10618600.2013.826583}, author = {Li, Fan and Baccini, Michela and Mealli, Fabrizia and Zell, Elizabeth R. and Frangakis, Constantine E. 
and Rubin, Donald B.} } @mastersthesis {paivathesis, title = {Multiple Imputation Methods for Nonignorable Nonresponse, Adaptive Survey Design, and Dissemination of Synthetic Geographies (Ph.D. thesis)}, volume = {Ph.D.}, year = {2014}, school = {Duke University}, type = {phd}, url = {http://dukespace.lib.duke.edu/dspace/handle/10161/9406}, author = {Thais Paiva} } @article {1518, title = {Multiple imputation of missing or faulty values under linear constraints}, journal = {Journal of Business and Economic Statistics}, volume = {32}, year = {2014}, pages = {375-386}, chapter = {375}, abstract = {

Many statistical agencies, survey organizations, and research centers collect data that suffer from item nonresponse and erroneous or inconsistent values. These data may be required to satisfy linear constraints, for example, bounds on individual variables and inequalities for ratios or sums of variables. Often these constraints are designed to identify faulty values, which then are blanked and imputed. The data also may exhibit complex distributional features, including nonlinear relationships and highly nonnormal distributions. We present a fully Bayesian, joint model for modeling or imputing data with missing/blanked values under linear constraints that (i) automatically incorporates the constraints in inferences and imputations, and (ii) uses a flexible Dirichlet process mixture of multivariate normal distributions to reflect complex distributional features. Our strategy for estimation is to augment the observed data with draws from a hypothetical population in which the constraints are not present, thereby taking advantage of computationally expedient methods for fitting mixture models. Missing/blanked items are sampled from their posterior distribution using the Hit-and-Run sampler, which guarantees that all imputations satisfy the constraints. We illustrate the approach using manufacturing data from Colombia, examining the potential to preserve joint distributions and a regression from the plant productivity literature. Supplementary materials for this article are available online.

}, doi = {10.1080/07350015.2014.885435}, author = {Kim, H. J. and Reiter, J. P. and Wang, Q. and Cox, L. H. and Karr, A. F.} } @techreport {handle:1813:45868, title = {NCRN Meeting Fall 2014}, number = {1813:45868}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014 Vilhuber, Lars Taken place at the ILR NYC Conference Center.}, url = {http://hdl.handle.net/1813/45868}, author = {Vilhuber, Lars} } @techreport {handle:1813:37750, title = {NCRN Meeting Fall 2014: Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography}, number = {1813:37750}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography Quick, Harrison; Holan, Scott; Wikle, Christopher; Reiter, Jerry Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37750}, author = {Quick, Harrison and Holan, Scott and Wikle, Christopher and Reiter, Jerry} } @techreport {handle:1813:37446, title = {NCRN Meeting Fall 2014: Change in Visible Impervious Surface Area in Southeastern Michigan Before and After the \"Great Recession\"}, number = {1813:37446}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Change in Visible Impervious Surface Area in Southeastern Michigan Before and After the \"Great Recession\" Wilson, Courtney; Brown, Daniel G. Presentation at Fall 2014 NCRN meeting}, url = {http://hdl.handle.net/1813/37446}, author = {Wilson, Courtney and Brown, Daniel G.} } @techreport {handle:1813:37748, title = {NCRN Meeting Fall 2014: Constrained Smoothed Bayesian Estimation}, number = {1813:37748}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Constrained Smoothed Bayesian Estimation Steorts, Rebecca; Shalizi, Cosma Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37748}, author = {Steorts, Rebecca and Shalizi, Cosma} } @techreport {handle:1813:37411, title = {NCRN Meeting Fall 2014: Decomposing Medical-Care Expenditure Growth}, number = {1813:37411}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Decomposing Medical-Care Expenditure Growth Dunn, Abe; Liebman, Eli; Shapiro, Adam}, url = {http://hdl.handle.net/1813/37411}, author = {Dunn, Abe and Liebman, Eli and Shapiro, Adam} } @techreport {handle:1813:37747, title = {NCRN Meeting Fall 2014: Designer Census Geographies}, number = {1813:37747}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Designer Census Geographies Spielman, Seth Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37747}, author = {Spielman, Seth} } @techreport {handle:1813:37412, title = {NCRN Meeting Fall 2014: Geographic linkages between National Center for Health Statistics{\textquoteright} population health surveys and air quality measures}, number = {1813:37412}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Geographic linkages between National Center for Health Statistics{\textquoteright} population health surveys and air quality measures Parker, Jennifer}, url = {http://hdl.handle.net/1813/37412}, author = {Parker, Jennifer} } 
@techreport {handle:1813:37749, title = {NCRN Meeting Fall 2014: Mixed Effects Modeling for Multivariate-Spatio-Temporal Areal Data}, number = {1813:37749}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Mixed Effects Modeling for Multivariate-Spatio-Temporal Areal Data Bradley, Jonathan; Holan, Scott; Wikle, Christopher Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37749}, author = {Bradley, Jonathan and Holan, Scott and Wikle, Christopher} } @techreport {handle:1813:37414, title = {NCRN Meeting Fall 2014: Respondent-Driven Sampling Estimation and the National HIV Behavioral Surveillance System}, number = {1813:37414}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Respondent-Driven Sampling Estimation and the National HIV Behavioral Surveillance System Spiller, Michael (Trey)}, url = {http://hdl.handle.net/1813/37414}, author = {Spiller, Michael (Trey)} } @techreport {handle:1813:45869, title = {NCRN Meeting Spring 2014}, number = {1813:45869}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014 Vilhuber, Lars Taken place at the Census Headquarters, Washington, DC.}, url = {http://hdl.handle.net/1813/45869}, author = {Vilhuber, Lars} } @techreport {handle:1813:36393, title = {NCRN Meeting Spring 2014: Adaptive Protocols and the DDI 4 Process Model}, number = {1813:36393}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Adaptive Protocols and the DDI 4 Process Model Greenfield, Jay; Kuan, Sophia Presentation from NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36393}, author = {Greenfield, Jay and Kuan, Sophia} } @techreport {handle:1813:36397, title = {NCRN Meeting Spring 2014: Aiming at a More Cost-Effective Census Via Online Data Collection: Privacy Trade-Offs of Geo-Location}, number = {1813:36397}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Aiming at a More Cost-Effective Census Via Online Data Collection: Privacy Trade-Offs of Geo-Location Brandimarte, Laura; Acquisti, Alessandro presentation at NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36397}, author = {Brandimarte, Laura and Acquisti, Alessandro} } @techreport {handle:1813:36399, title = {NCRN Meeting Spring 2014: Imputation of multivariate continuous data with non-ignorable missingness}, number = {1813:36399}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Imputation of multivariate continuous data with non-ignorable missingness Paiva, Thais; Reiter, Jerry Presentation at Spring 2014 NCRN meeting}, url = {http://hdl.handle.net/1813/36399}, author = {Paiva, Thais and Reiter, Jerry} } @techreport {handle:1813:36392, title = {NCRN Meeting Spring 2014: Integrating PROV with DDI: Mechanisms of Data Discovery within the U.S. Census Bureau}, number = {1813:36392}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Integrating PROV with DDI: Mechanisms of Data Discovery within the U.S. 
Census Bureau Block, William; Brown, Warren; Williams, Jeremy; Vilhuber, Lars; Lagoze, Carl presentation at NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36392}, author = {Block, William and Brown, Warren and Williams, Jeremy and Vilhuber, Lars and Lagoze, Carl} } @techreport {handle:1813:36395, title = {NCRN Meeting Spring 2014: Introduction}, number = {1813:36395}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Introduction Thompson, John NCRN Spring 2014 Meeting}, url = {http://hdl.handle.net/1813/36395}, author = {Thompson, John} } @techreport {handle:1813:36394, title = {NCRN Meeting Spring 2014: Metadata Standards \& Technology Development for the NSF Survey of Earned Doctorates}, number = {1813:36394}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Metadata Standards \& Technology Development for the NSF Survey of Earned Doctorates Noonan, Kimberly; Heus, Pascal; Mulcahy, Tim Presentation from NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36394}, author = {Noonan, Kimberly and Heus, Pascal and Mulcahy, Tim} } @techreport {handle:1813:36400, title = {NCRN Meeting Spring 2014: Research Program and Enterprise Architecture for Adaptive Survey Design at Census}, number = {1813:36400}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Research Program and Enterprise Architecture for Adaptive Survey Design at Census Miller, Peter; Mathur, Anup; Thieme, Michael}, url = {http://hdl.handle.net/1813/36400}, author = {Miller, Peter and Mathur, Anup and Thieme, Michael} } @techreport {handle:1813:36396, title = {NCRN Meeting Spring 2014: Summer Working Group for Employer List Linking (SWELL)}, number = {1813:36396}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Summer Working Group for Employer List Linking (SWELL) Gathright, Graton; Kutzbach, Mark; Mccue, Kristin; McEntarfer, Erika; Monti, Holly; Trageser, Kelly; Vilhuber, Lars; Wasi, Nada; Wignall, Christopher Presentation for NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36396}, author = {Gathright, Graton and Kutzbach, Mark and Mccue, Kristin and McEntarfer, Erika and Monti, Holly and Trageser, Kelly and Vilhuber, Lars and Wasi, Nada and Wignall, Christopher} } @techreport {handle:1813:36398, title = {NCRN Meeting Spring 2014: Web Surveys, Online Panels, and Paradata: Automating Adaptive Design}, number = {1813:36398}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Web Surveys, Online Panels, and Paradata: Automating Adaptive Design McCutcheon, Allan Presentation at NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36398}, author = {McCutcheon, Allan} } @techreport {handle:1813:40233, title = {NCRN Newsletter: Volume 1 - Issue 2}, number = {1813:40233}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 1 - Issue 2 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from November 2013 to March 2014. NCRN Newsletter Vol. 
1, Issue 2: March 20, 2014}, url = {http://hdl.handle.net/1813/40233}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:40234, title = {NCRN Newsletter: Volume 1 - Issue 3}, number = {1813:40234}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 1 - Issue 3 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from March 2014 to July 2014. NCRN Newsletter Vol. 1, Issue 3: July 23, 2014}, url = {http://hdl.handle.net/1813/40234}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:40192, title = {NCRN Newsletter: Volume 1 - Issue 4}, number = {1813:40192}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 1 - Issue 4 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from July 2014 to October 2014. NCRN Newsletter Vol. 1, Issue 4: October 15, 2014}, url = {http://hdl.handle.net/1813/40192}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @techreport {handle:1813:40828, title = {A New Method for Protecting Interrelated Time Series with Bayesian Prior Distributions and Synthetic Data}, number = {1813:40828}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {A New Method for Protecting Interrelated Time Series with Bayesian Prior Distributions and Synthetic Data Schneider, Matthew J.; Abowd, John M. Organizations disseminate statistical summaries of administrative data via the Web for unrestricted public use. They balance the trade-off between confidentiality protection and inference quality. Recent developments in disclosure avoidance techniques include the incorporation of synthetic data, which capture the essential features of underlying data by releasing altered data generated from a posterior predictive distribution. The United States Census Bureau collects millions of interrelated time series micro-data that are hierarchical and contain many zeros and suppressions. Rule-based disclosure avoidance techniques often require the suppression of count data for small magnitudes and the modification of data based on a small number of entities. Motivated by this problem, we use zero-inflated extensions of Bayesian Generalized Linear Mixed Models (BGLMM) with privacy-preserving prior distributions to develop methods for protecting and releasing synthetic data from time series about thousands of small groups of entities without suppression based on the magnitudes or number of entities. We find that as the prior distributions of the variance components in the BGLMM become more precise toward zero, confidentiality protection increases and inference quality deteriorates. We evaluate our methodology using a strict privacy measure, empirical differential privacy, and a newly defined risk measure, Probability of Range Identification (PoRI), which directly measures attribute disclosure risk. We illustrate our results with the U.S. Census Bureau{\textquoteright}s Quarterly Workforce Indicators.}, url = {http://hdl.handle.net/1813/40828}, author = {Schneider, Matthew J. 
and Abowd, John M.} } @article {1799, title = {NewsViews: An Automated Pipeline for Creating Custom Geovisualizations for News}, year = {2014}, abstract = {Interactive visualizations add rich, data-based context to online news articles. Geographic maps are currently the most prevalent form of these visualizations. Unfortunately, designers capable of producing high-quality, customized geovisualizations are scarce. We present NewsViews, a novel automated news visualization system that generates interactive, annotated maps without requiring professional designers. NewsViews{\textquoteright} maps support trend identification and data comparisons relevant to a given news article. The NewsViews system leverages text mining to identify key concepts and locations discussed in articles (as well as potential annotations), an extensive repository of {\textquotedblleft}found{\textquotedblright} databases, and techniques adapted from cartography to identify and create visually {\textquotedblleft}interesting{\textquotedblright} thematic maps. In this work, we develop and evaluate key criteria in automatic, annotated map generation and experimentally validate the key features for successful representations (e.g., relevance to context, variable selection, "interestingness" of representation and annotation quality). }, doi = {10.1145/2556288.2557228}, url = {http://cond.org/newsviews.html }, author = {Gao, T. and Hullman, J. and Adar, E. and Hecht, B. and Diakopoulos, N.} } @article {singleton2014geodem, title = {The Past, Present, and Future of Geodemographic Research in the United States and United Kingdom}, journal = {The Professional Geographer}, volume = {4}, year = {2014}, author = {Singleton, A. and Spielman, S. E.} } @conference {Bradley2014, title = {The Poisson Change of Support Problem with Applications to the American Community Survey}, booktitle = {Joint Statistical Meetings 2014}, year = {2014}, author = {Bradley, J.R.} } @conference {2155, title = {Predicting Survey Breakoff in Online Survey Panels}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {McCutcheon, A.L.} } @techreport {handle:1813:38121, title = {Reducing Uncertainty in the American Community Survey through Data-Driven Regionalization}, number = {1813:38121}, year = {2014}, institution = {University of Colorado at Boulder / University of Tennessee}, type = {Preprint}, abstract = {Reducing Uncertainty in the American Community Survey through Data-Driven Regionalization Spielman, Seth; Folch, David The American Community Survey (ACS) is the largest US survey of households and is the principal source for neighborhood scale information about the US population and economy. The ACS is used to allocate billions in federal spending and is a critical input to social scientific research in the US. However, estimates from the ACS can be highly unreliable. For example, in over 72\% of census tracts the estimated number of children under 5 in poverty has a margin of error greater than the estimate. Uncertainty of this magnitude complicates the use of social data in policy making, research, and governance. This article develops a spatial optimization algorithm that is capable of reducing the margins of error in survey data via the creation of new composite geographies, a process called regionalization. Regionalization is a complex combinatorial problem. 
Here rather than focusing on the technical aspects of regionalization we demonstrate how to use a purpose built open source regionalization algorithm to post-process survey data in order to reduce the margins of error to some user-specified threshold.}, url = {http://hdl.handle.net/1813/38121}, author = {Spielman, Seth and Folch, David} } @conference {2146, title = {Remembering where: A look at the American Time Use Survey}, booktitle = {Paper presented at the annual conference of the Midwest Association for Public Opinion Research}, year = {2014}, month = {11/2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Deal, C. and Cordova-Cazar, A.L. and Countryman, A. and Kirchner, A. and Belli, R.F.} } @article {pee:vos:acq:2014, title = {Reputation as a Sufficient Condition for Data Quality on Amazon Mechanical Turk}, journal = {Behavior Research Methods}, volume = {46}, number = {4}, year = {2014}, month = {December}, pages = {1023{\textendash}1031}, author = {Peer, E. and Vosgerau, J. and Acquisti, A.} } @inbook {2411, title = {The Rise of Incarceration Among the Poor with Mental Illnesses: How Neoliberal Policies Contribute}, booktitle = {The Routledge Handbook of Poverty in the United States}, year = {2014}, publisher = {Routledge}, organization = {Routledge}, author = {Camp, J. and Haymes, S. and Haymes, M. V. d. and Miller, R.J.} } @conference {2156, title = {The Role of Device Type in Internet Panel Survey Breakoff}, booktitle = {Midwest Association for Public Opinion Research Annual Conference}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {McCutcheon, A.L.} } @article {1798, title = {Savings from ages 16 to 35: A test to inform Child Development Account policy}, journal = {Poverty \& Public Policy}, volume = {6}, year = {2014}, chapter = {46-70}, doi = {10.1002/pop4.59 }, url = {http://onlinelibrary.wiley.com/store/10.1002/pop4.59/asset/pop459.pdf}, author = {Friedline, T. and Nam, I.} } @article {Ventura2014, title = {Seeing the Non-Stars: (Some) Sources of Bias in Past Disambiguation Approaches and a New Public Tool Leveraging Labeled Records}, journal = {Research Policy}, year = {2014}, note = {Selected for Special Issue on Big Data}, month = {December}, author = {Ventura, S. and Nugent, R. and Fuchs, E.} } @booklet {Belli, title = {SIPP: From Conventional Questionnaire to Event History Calendar Interviewing}, year = {2014}, note = {Workshop on \ìConducting Research using the Survey of Income and Program Participation (SIPP). Presented at Duke University, Social Science Research Institute, Durham, NC}, month = {February}, author = {Belli, R.F.} } @conference {ste:hal:fie:2014, title = {SMERED: A Bayesian Approach to Graphical Record Linkage and De-duplication}, booktitle = {AISTATS 2014 Proceedings, JMLR}, volume = {33}, year = {2014}, pages = {922{\textendash}930}, publisher = {W\& CP}, organization = {W\& CP}, author = {Steorts, R. and Hall, R. and Fienberg, S. E.} } @techreport {handle:1813:52607, title = {Sorting Between and Within Industries: A Testable Model of Assortative Matching}, number = {1813:52607}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {Sorting Between and Within Industries: A Testable Model of Assortative Matching Abowd, John M.; Kramarz, Francis; Perez-Duarte, Sebastien; Schmutte, Ian M. 
We test Shimer{\textquoteright}s (2005) theory of the sorting of workers between and within industrial sectors based on directed search with coordination frictions, deliberately maintaining its static general equilibrium framework. We fit the model to sector-specific wage, vacancy and output data, including publicly-available statistics that characterize the distribution of worker and employer wage heterogeneity across sectors. Our empirical method is general and can be applied to a broad class of assignment models. The results indicate that industries are the loci of sorting{\textendash}more productive workers are employed in more productive industries. The evidence confirms that strong assortative matching can be present even when worker and employer components of wage heterogeneity are weakly correlated.}, url = {http://hdl.handle.net/1813/52607}, author = {Abowd, John M. and Kramarz, Francis and Perez-Duarte, Sebastien and Schmutte, Ian M.} } @article {spielman2014spatial, title = {Spatial Collective Intelligence? Accuracy, Credibility in Crowdsourced Data}, journal = {Cartography and Geographic Information Science}, volume = {41}, number = {2}, year = {2014}, pages = {115-124}, doi = {http://dx.doi.org/10.1080/15230406.2013.874200}, url = {http://go.galegroup.com/ps/i.do?action=interpret\&id=GALE|A361943563\&v=2.1\&u=nysl_sc_cornl\&it=r\&p=AONE\&sw=w\&authCount=1}, author = {Spielman, S. E.} } @booklet {Holan2014e, title = {Spatial Fay-Herriot Models for Small Area Estimation With Functional Covariates}, year = {2014}, month = {January}, author = {Holan, S.H.} } @article {Porter2014a, title = {Spatial Fay-Herriot Models for Small Area Estimation with Functional Covariates}, journal = {Spatial Statistics}, volume = {10}, year = {2014}, pages = {27-42}, url = {http://arxiv.org/pdf/1303.6668v3.pdf}, author = {Porter, A. T., and Holan, S.H., and Wikle, C.K., and Cressie, N.} } @conference {bal:bra:acq:2014, title = {Spiny CACTOS: OSN Users Attitudes and Perceptions Towards Cryptographic Access Control Tools}, booktitle = {Proceedings of the Workshop on Usable Security (USEC)}, year = {2014}, url = {https://www.internetsociety.org/doc/spiny-cactos-osn-users-attitudes-and-perceptions-towards-cryptographic-access-control-tools}, author = {Balsa, E., and Brandimarte, L., and Acquisti, A., and Diaz, C., and G{\"u}rses, S.} } @conference {griffin2014supporting2, title = {Supporting Planners{\textquoteright} Work with Uncertain Demographic Data}, booktitle = {GIScience Workshop on Uncertainty Visualization}, volume = {23}, year = {2014}, url = {http://cognitivegiscience.psu.edu/uncertainty2014/papers/griffin_demographic.pdf.}, author = {Griffin, A. L. and Spielman, S. E. and Jurjevich, J. and Merrick, M. and Nagle, N. N. and Folch, D. C.} } @conference {griffin2014supporting, title = {Supporting Planners{\textquoteright} work with Uncertain Demographic Data}, booktitle = {Proceedings of IEEE VIS 2014}, year = {2014}, pages = {9{\textendash}14}, publisher = {Proceedings of IEEE VIS 2014}, organization = {Proceedings of IEEE VIS 2014}, url = {http://cognitivegiscience.psu.edu/uncertainty2014/papers/griffin_demographic.pdf}, author = {Griffin, A. L. and Spielman, S. E. and Nagle, N. N. and Jurjevich, J. and Merrick, M. and Folch, D. 
C.} } @conference {Bradley2014a, title = {Survey Fusion for Data that Exhibit Multivariate, Spatio-Temporal Dependencies}, booktitle = {Joint Statistical Meetings 2014}, year = {2014}, author = {Bradley, J.R.} } @conference {2147, title = {Survey Informatics: Ideas, Opportunities, and Discussions}, booktitle = {UNL/SRAM/Gallup Symposium}, year = {2014}, address = {Omaha, NE}, url = {http://grc.unl.edu/unlsramgallup-symposium}, author = {Eck, A. and Soh, L-K} } @booklet {Porter2014e, title = {A Survey of Contemporary Spatial Models for Small Area Estimation}, year = {2014}, month = {January}, author = {Porter, A.T.} } @article {1572, title = {SynLBD 2.0: Improving the Synthetic Longitudinal Business Database}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {30}, year = {2014}, pages = {129-135}, author = {S. K. Kinney and J. P. Reiter and J. Miranda} } @article {cri:edd:2014, title = {Top-Coding and Public Use Microdata Samples from the U.S. Census Bureau}, journal = {Journal of Privacy and Confidentiality}, volume = {6}, number = {2}, year = {2014}, pages = {21{\textendash}58}, url = {http://repository.cmu.edu/jpc/vol6/iss2/2/}, author = {Crimi, N. and Eddy, W. C.} } @article {1797, title = {Toward healthy balance sheets: Savings accounts as a gateway for young adults{\textquoteright} asset diversification and accumulation}, journal = {The St. Louis Federal Reserve Bulletin}, year = {2014}, url = {http://research.stlouisfed.org/publications/review/2014/q4/friedline.pdf}, author = {Friedline, T. and Johnson, P. and Hughes, R.} } @mastersthesis {bellman2014honorsthesis, title = {Towards an Understanding of Dynamics Between Race, Population Movement, and the Built Environment of American Cities (undergraduate honors thesis)}, year = {2014}, school = {University of Colorado at Boulder}, type = {Undergraduate Honors Thesis}, author = {Bellman, B.} } @techreport {2418, title = {Twitter, Big Data, and Jobs Numbers}, year = {2014}, type = {online}, url = {http://www.lsa.umich.edu/lsa/ci.twitterbigdataandjobsnumbers_ci.detail}, author = {Hudomiet, Peter} } @techreport {handle:1813:38122, title = {Uncertain Uncertainty: Spatial Variation in the Quality of American Community Survey Estimates}, number = {1813:38122}, year = {2014}, institution = {University of Colorado at Boulder / University of Tennessee}, type = {Preprint}, abstract = {Uncertain Uncertainty: Spatial Variation in the Quality of American Community Survey Estimates Folch, David C.; Arribas-Bel, Daniel; Koschinsky, Julia; Spielman, Seth E. The U.S. Census Bureau{\textquoteright}s American Community Survey (ACS) is the foundation of social science research, much federal resource allocation and the development of public policy and private sector decisions. However, the high uncertainty associated with some of the ACS{\textquoteright}s most frequently used estimates can jeopardize the accuracy of inferences based on these data. While there is high level understanding in the research community that problems exist in the data, the sources and implications of these problems have been largely overlooked. Using 2006-2010 ACS median household income at the census tract scale as the test case (where a third of small-area estimates have higher than recommend errors), we explore the patterns in the uncertainty of ACS data. We consider various potential sources of uncertainty in the data, ranging from response level to geographic location to characteristics of the place. 
We find that there exist systematic patterns in the uncertainty in both the spatial and attribute dimensions. Using a regression framework, we identify the factors that are most frequently correlated with the error at national, regional and metropolitan area scales, and find these correlates are not consistent across the various locations tested. The implication is that data quality varies in different places, making cross-sectional analysis both within and across regions less reliable. We also present general advice for data users and potential solutions to the challenges identified.}, url = {http://hdl.handle.net/1813/38122}, author = {Folch, David C. and Arribas-Bel, Daniel and Koschinsky, Julia and Spielman, Seth E.} } @inbook {McCutcheon2014, title = {The Untold Story of Multi-Mode (Online and Mail) Consumer Panels: From Optimal Recruitment to Retention and Attrition}, booktitle = {Online Panel Surveys: An Interdisciplinary Approach}, year = {2014}, publisher = {Wiley}, organization = {Wiley}, doi = {10.1002/9781118763520.ch5}, author = {McCutcheon, Allan L. and Rao, K., and Kaminska, O.}, editor = {Callegaro, M. and Baker, R. and Bethlehem, J. and G{\"o}ritz, A. and Krosnick, J. and Lavrakas, P.} } @article {kimberlin2014updated, title = {An updated method for calculating income and payroll taxes from PSID data using the NBER{\textquoteright}s TAXSIM, for PSID survey years 1999 through 2011}, journal = {Unpublished manuscript, University of Michigan. Accessed May}, volume = {6}, year = {2014}, pages = {2016}, abstract = {This paper describes a method to calculate income and payroll taxes from Panel Study of Income Dynamics data using the NBER's Internet TAXSIM version 9 (http://users.nber.org/~taxsim/taxsim9/), for PSID survey years 1999, 2001, 2003, 2005. 2007, 2009, and 2011 (tax years n-1). These methods are implemented in two Stata programs, designed to be used with the PSID public-use zipped Main Interview data files: PSID_TAXSIM_1of2.do and PSID_TAXSIM_2of2.do. The main program (2of2) was written by Sara Kimberlin (skimberlin@berkeley.edu) and generates all TAXSIM input variables, runs TAXSIM, adjusts tax estimates using additional information available in PSID data, and calculates total PSID family unit taxes. A separate program (1of2) was written by Jiyoon (June) Kim (junekim@umich.edu) in collaboration with Luke Shaefer (lshaefer@umich.edu) to calculate mortgage interest for itemized deductions; this program needs to be run first, before the main program. Jonathan Latner contributed code to use the programs with the PSID zipped data. The overall methods build on the strategy for using TAXSIM with PSID data outlined by Butrica \& Burkhauser (1997), with some expansions and modifications. Note that the methods described below are designed to prioritize accuracy of income taxes calculated for low-income households, particularly refundable tax credits such as the Earned Income Tax Credit (EITC) and the Additional Child Tax Credit. Income tax liability is generally low for low-income households, and the amount of refundable tax credits is often substantially larger than tax liabilities for this population. Payroll tax can also be substantial for low-income households. Thus the methods below focus on maximizing accuracy of income tax and payroll tax calculations for low-income families, with less attention to tax items that largely impact higher-income households (e.g. 
the treatment of capital gains).}, author = {Kimberlin, Sara and Kim, Jiyoun and Shaefer, Luke} } @conference {2144, title = {The use of paradata (in time use surveys) to better evaluate data quality}, booktitle = {American Association for Public Opinion Research 2014 Annual Conference}, year = {2014}, address = {Anaheim, CA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Cordova-Cazar, A.L. and Belli, R.F.} } @techreport {handle:1813:40852, title = {Using partially synthetic data to replace suppression in the Business Dynamics Statistics: early results}, number = {1813:40852}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {Using partially synthetic data to replace suppression in the Business Dynamics Statistics: early results Miranda, Javier; Vilhuber, Lars The Business Dynamics Statistics is a product of the U.S. Census Bureau that provides measures of business openings and closings, and job creation and destruction, by a variety of cross-classifications (firm and establishment age and size, industrial sector, and geography). Sensitive data are currently protected through suppression. However, as additional tabulations are being developed, at ever more detailed geographic levels, the number of suppressions increases dramatically. This paper explores the option of providing public-use data that are analytically valid and without suppressions, by leveraging synthetic data to replace observations in sensitive cells.}, url = {http://hdl.handle.net/1813/40852}, author = {Miranda, Javier and Vilhuber, Lars} } @article {2171, title = {Using Partially Synthetic Data to Replace Suppression in the Business Dynamics Statistics: Early Results}, journal = {Privacy in Statistical Databases}, year = {2014}, pages = {232-242}, abstract = {The Business Dynamics Statistics is a product of the U.S. Census Bureau that provides measures of business openings and closings, and job creation and destruction, by a variety of cross-classifications (firm and establishment age and size, industrial sector, and geography). Sensitive data are currently protected through suppression. However, as additional tabulations are being developed, at ever more detailed geographic levels, the number of suppressions increases dramatically. This paper explores the option of providing public-use data that are analytically valid and without suppressions, by leveraging synthetic data to replace observations in sensitive cells.}, isbn = {978-3-319-11256-5}, doi = {10.1007/978-3-319-11257-2_18}, url = {http://dx.doi.org/10.1007/978-3-319-11257-2_18}, author = {J. Miranda and L. Vilhuber} } @techreport {2410, title = {Using Social Media to Measure Labor Market Flows}, year = {2014}, type = {Mimeo}, url = {http://www-personal.umich.edu/~shapiro/papers/LaborFlowsSocialMedia.pdf}, author = {Antenucci, Dolan and Cafarella, Michael J and Levenstein, Margaret C. and R{\'e}, Christopher and Shapiro, Matthew} } @conference {McCutcheon2014a, title = {Web Surveys, Online Panels, and Paradata: Automating Adaptive Design}, booktitle = {NSF-Census Research Network (NCRN) Spring Meeting}, year = {2014}, note = {Conference on Methodological Innovations in the Study of Elections in Europe and Beyond. Presented at Texas A\&M University}, address = {Washington, DC}, url = {http://www.ncrn.info/event/ncrn-meeting-spring-2014}, author = {McCutcheon, A.L.} } @article {1863, title = {What are You Doing Now? 
Activity Level Responses and Errors in the American Time Use Survey}, journal = {Journal of Survey Statistics and Methodology}, volume = {2}, year = {2014}, chapter = {519-537}, author = {T. Al Baghal and Belli, R.F. and Phillips, A.L. and Ruther, N.} } @article {1523, title = {Why data availability is such a hard problem}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {30}, year = {2014}, month = {06/2014}, chapter = {101-107}, abstract = {If data availability were a simple problem, it would already have been resolved. In this paper, I argue that by viewing data availability as a public good, it is possible to both understand the complexities with which it is fraught and identify a path to a solution. }, keywords = {Data Archive, Data availability, public good, replicability, reproducibility}, issn = {1875-9254}, author = {A. F. Karr} } @conference {woo:pih:acq:2014, title = {Would a Privacy Fundamentalist Sell their DNA for \$1000... if Nothing Bad Happened Thereafter? A Study of the Westin Categories, Behavior Intentions, and Consequences}, booktitle = {Proceedings of the Tenth Symposium on Usable Privacy and Security (SOUPS)}, year = {2014}, note = {IAPP SOUPS Privacy Award Winner}, publisher = {ACM}, organization = {ACM}, address = {New York, NY}, url = {https://www.usenix.org/conference/soups2014/proceedings/presentation/woodruff}, author = {Woodruff, A. and Pihur, V. and Acquisti, A. and Consolvo, S. and Schmidt, L. and Brandimarte, L.} } @article {Hu13, title = {Are independent parameter draws necessary for multiple imputation?}, journal = {The American Statistician}, volume = {67}, year = {2013}, pages = {143-149}, doi = {10.1080/00031305.2013.821953}, url = {http://www.tandfonline.com/doi/full/10.1080/00031305.2013.821953}, author = {Hu, J. and Mitra, R. and Reiter, J.P.} } @booklet {Holan2013f, title = {A Bayesian Approach to Estimating Agricultural Yield Based on Multiple Repeated Surveys, Institute of Public Policy and the Truman School of Public Affairs}, year = {2013}, month = {March}, author = {Holan, S.H.} } @techreport {2653, title = {A Bayesian Approach to Graphical Record Linkage and De-duplication}, number = {1312.4645}, year = {2013}, abstract = {We propose an unsupervised approach for linking records across arbitrarily many files, while simultaneously detecting duplicate records within files. Our key innovation involves the representation of the pattern of links between records as a bipartite graph, in which records are directly linked to latent true individuals, and only indirectly linked to other records. This flexible representation of the linkage structure naturally allows us to estimate the attributes of the unique observable people in the population, calculate transitive linkage probabilities across records (and represent this visually), and propagate the uncertainty of record linkage into later analyses. Our method makes it particularly easy to integrate record linkage with post-processing procedures such as logistic regression, capture{\textendash}recapture, etc. Our linkage structure lends itself to an efficient, linear-time, hybrid Markov chain Monte Carlo algorithm, which overcomes many obstacles encountered by previous record linkage approaches, despite the high-dimensional parameter space. 
We illustrate our method using longitudinal data from the National Long Term Care Survey and with data from the Italian Survey on Household and Wealth, where we assess the accuracy of our method and show it to be better in terms of error rates and empirical scalability than other approaches in the literature. Supplementary materials for this article are available online.}, url = {https://arxiv.org/abs/1312.4645}, author = {Steorts, Rebecca C. and Hall, Rob and Fienberg, Stephen E.} } @booklet {Cressie2013a, title = {Bayesian inference for the Spatial Random Effects Model}, journal = {Department of Statistics, Macquarie University}, year = {2013}, month = {July}, publisher = {Macquarie University}, author = {Cressie, N.} } @conference {BanerjeeAISTAT, title = {Bayesian learning of joint distributions of objects}, booktitle = {Proceedings of the 16th International Conference on Artificial Intelligence and Statistics (AISTATS) 2013}, year = {2013}, abstract = {

There is increasing interest in broad application areas in defining flexible joint models for data having a variety of measurement scales, while also allowing data of complex types, such as functions, images and documents. We consider a general framework for nonparametric Bayes joint modeling through mixture models that incorporate dependence across data types through a joint mixing measure. The mixing measure is assigned a novel infinite tensor factorization (ITF) prior that allows flexible dependence in cluster allocation across data types. The ITF prior is formulated as a tensor product of stick-breaking processes. Focusing on a convenient special case corresponding to a Parafac factorization, we provide basic theory justifying the flexibility of the proposed prior and resulting asymptotic properties. Focusing on ITF mixtures of product kernels, we develop a new Gibbs sampling algorithm for routine implementation relying on slice sampling. The methods are compared with alternative joint mixture models based on Dirichlet processes and related approaches through simulations and real data applications.

Also at http://arxiv.org/abs/1303.0449

}, url = {http://jmlr.csail.mit.edu/proceedings/papers/v31/banerjee13a.html}, author = {Banerjee, A. and Murray, J. and Dunson, D. B.} } @conference {Wu2013b, title = {Bayesian Modeling in the Era of Big Data: the Role of High-Throughput and High-Performance Computing}, booktitle = {The Extreme Science and Engineering Discovery Environment Conference}, year = {2013}, month = {July}, address = {San Diego, CA}, author = {Wu, G.} } @techreport {handle:1813:34889, title = {Bayesian multiple imputation for large-scale categorical data with structural zeros}, number = {1813:34889}, year = {2013}, institution = {Duke University / National Institute of Statistical Sciences (NISS)}, type = {Preprint}, abstract = {Bayesian multiple imputation for large-scale categorical data with structural zeros Manrique-Vallier, D.; Reiter, J. P. We propose an approach for multiple imputation of items missing at random in large-scale surveys with exclusively categorical variables that have structural zeros. Our approach is to use mixtures of multinomial distributions as imputation engines, accounting for structural zeros by conceiving of the observed data as a truncated sample from a hypothetical population without structural zeros. This approach has several appealing features: imputations are generated from coherent, Bayesian joint models that automatically capture complex dependencies and readily scale to large numbers of variables. We outline a Gibbs sampling algorithm for implementing the approach, and we illustrate its potential with a repeated sampling study using public use census microdata from the state of New York, USA.}, url = {http://hdl.handle.net/1813/34889}, author = {Manrique-Vallier, D. and Reiter, J. P.} } @techreport {handle:1813:37986, title = {b-Bit Minwise Hashing in Practice}, number = {1813:37986}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {b-Bit Minwise Hashing in Practice Li, Ping; Shrivastava, Anshumali; K{\"o}nig, Arnd Christian Minwise hashing is a standard technique in the context of search for approximating set similarities. The recent work [26, 32] demonstrated a potential use of b-bit minwise hashing [23, 24] for efficient search and learning on massive, high-dimensional, binary data (which are typical for many applications in Web search and text mining). In this paper, we focus on a number of critical issues which must be addressed before one can apply b-bit minwise hashing to the volumes of data often used in industrial applications.}, url = {http://hdl.handle.net/1813/37986}, author = {Li, Ping and Shrivastava, Anshumali and K{\"o}nig, Arnd Christian} } @conference {PingShrivastava2013, title = {b-Bit Minwise Hashing in Practice}, booktitle = {Internetware{\textquoteright}13}, year = {2013}, month = {October}, abstract = {Minwise hashing is a standard technique in the context of search for approximating set similarities. The recent work [26, 32] demonstrated a potential use of b-bit minwise hashing [23, 24] for efficient search and learning on massive, high-dimensional, binary data (which are typical for many applications in Web search and text mining). In this paper, we focus on a number of critical issues which must be addressed before one can apply b-bit minwise hashing to the volumes of data often used in industrial applications. Minwise hashing requires an expensive preprocessing step that computes k (e.g., 500) minimal values after applying the corresponding permutations for each data vector. 
We developed a parallelization scheme using GPUs and observed that the preprocessing time can be reduced by a factor of 20 ~ 80 and becomes substantially smaller than the data loading time. Reducing the preprocessing time is highly beneficial in practice, e.g., for duplicate Web page detection (where minwise hashing is a major step in the crawling pipeline) or for increasing the testing speed of online classifiers. Another critical issue is that for very large data sets it becomes impossible to store a (fully) random permutation matrix, due to its space requirements. Our paper is the first study to demonstrate that b-bit minwise hashing implemented using simple hash functions, e.g., the 2-universal (2U) and 4-universal (4U) hash families, can produce very similar learning results as using fully random permutations. Experiments on datasets of up to 200GB are presented.}, url = {http://www.nudt.edu.cn/internetware2013/}, author = {Ping Li and Anshumali Shrivastava and K{\"o}nig, Arnd Christian} } @conference {ShrivastavaLi2013a, title = {Beyond Pairwise: Provably Fast Algorithms for Approximate K-Way Similarity Search}, booktitle = {Neural Information Processing Systems (NIPS)}, year = {2013}, author = {Anshumali Shrivastava and Ping Li} } @conference {Wu2013a, title = {Binomial Mixture Models for Urban Ecological Monitoring Studies Using American Community Survey Demographic Covariates}, booktitle = {Joint Statistical Meetings 2013}, year = {2013}, month = {August}, address = {Montreal, Canada}, author = {Wu, G.} } @conference {Spielman2013, title = {The Co-Evolution of Residential Segregation and the Built Environment at the Turn of the 20th Century: A Schelling Model}, booktitle = {Transactions in GIS}, year = {2013}, doi = {10.1111/tgis.12014}, author = {S.E. Spielman and Patrick Harrison} } @inbook {Olson2013, title = {Collecting paradata for measurement error evaluation}, booktitle = {Improving Surveys with Paradata: Analytic Uses of Process Information}, year = {2013}, pages = {43-72}, publisher = {John Wiley and Sons}, organization = {John Wiley and Sons}, chapter = {Collecting paradata for measurement error evaluation}, address = {Hoboken, NJ.}, doi = {10.1002/9781118596869.ch3}, author = {Olson, K. and Parkhurst, B.}, editor = {Frauke Kreuter} } @article {fie:2013, title = {Comment: Innovations Associated with Multiple Systems Estimation in Human Rights Settings}, journal = {The American Statistician}, volume = {67}, number = {4}, year = {2013}, author = {Fienberg, S. 
E.} } @conference {Cressie2013d, title = {Comparing and Selecting Predictors Using Local Criteria}, booktitle = {International Workshop on Recent Advances in Statistical Inference: Theory and Case Studies}, year = {2013}, month = {March}, publisher = {International Workshop on Recent Advances in Statistical Inference: Theory and Case Studies}, organization = {International Workshop on Recent Advances in Statistical Inference: Theory and Case Studies}, address = {Padua, Italy}, author = {Cressie, N.} } @conference {acq:2013, title = {Complementary Perspectives on Privacy and Security: Economics}, booktitle = {IEEE Security \& Privacy}, volume = {11}, number = {1}, year = {2013}, note = {Invited paper}, pages = {93{\textendash}95}, doi = {10.1109/MSP.2013.30}, author = {Acquisti, A.} } @techreport {handle:1813:34447, title = {Credible interval estimates for official statistics with survey nonresponse}, number = {1813:34447}, year = {2013}, institution = {Northwestern University}, type = {Preprint}, abstract = {Credible interval estimates for official statistics with survey nonresponse Manski, Charles F. Government agencies commonly report official statistics based on survey data as point estimates, without accompanying measures of error. In the absence of agency guidance, users of the statistics can only conjecture the error magnitudes. Agencies could mitigate misinterpretation of official statistics if they were to measure potential errors and report them. Agencies could report sampling error using established statistical principles. It is more challenging to report nonsampling errors because there are many sources of such errors and there has been no consensus about how to measure them. To advance discourse on practical ways to report nonsampling error, this paper considers error due to survey nonresponse. I summarize research deriving interval estimates that make no assumptions about the values of missing data. In the absence of assumptions, one can obtain computable bounds on the population parameters that official statistics intend to measure. I also explore the middle ground between interval estimation making no assumptions and traditional point estimation using weights and imputations to implement assumptions that nonresponse is conditionally random. I am grateful to Aanchal Jain for excellent research assistance and to Bruce Spencer for helpful discussions. I have benefitted from the opportunity to present this work in a seminar at the Institute for Social and Economic Research, University of Essex.}, url = {http://hdl.handle.net/1813/34447}, author = {Manski, Charles F.} } @article {DBLP:journals/ijdc/LagozeBWAV13, title = {Data Management of Confidential Data}, journal = {International Journal of Digital Curation}, volume = {8}, number = {1}, year = {2013}, note = {Presented at 8th International Digital Curation Conference 2013, Amsterdam. See also http://hdl.handle.net/1813/30924}, pages = {265-278}, abstract = {Social science researchers increasingly make use of data that is confidential because it contains linkages to the identities of people, corporations, etc. The value of this data lies in the ability to join the identifiable entities with external data such as genome data, geospatial information, and the like. However, the confidentiality of this data is a barrier to its utility and curation, making it difficult to fulfill US federal data management mandates and interfering with basic scholarly practices such as validation and reuse of existing results. 
We describe the complexity of the relationships among data that span a public and private divide. We then describe our work on the CED2AR prototype, a first step in providing researchers with a tool that spans this divide and makes it possible for them to search, access, and cite that data.}, doi = {10.2218/ijdc.v8i1.259}, author = {Carl Lagoze and William C. Block and Jeremy Williams and John M. Abowd and Lars Vilhuber} } @conference {2167, title = {Do {\textquoteleft}Don{\textquoteright}t Know{\textquoteright} Responses = Survey Satisficing? Evidence from the Gallup Panel Paradata}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wang, Mengyang and Ruppanner, Leah and McCutcheon, Allan L.} } @article {Shaefer2013, title = {Do single mothers in the United States use the Earned Income Tax Credit to reduce unsecured debt?}, journal = {Review of Economics of the Household}, number = {11}, year = {2013}, note = {NCRN}, pages = {659{\textendash}680}, type = {Journal Article}, abstract = {

The Earned Income Tax Credit (EITC) is a refundable credit for low income workers mainly targeted at families with children. This study uses the Survey of Income and Program Participation{\textquoteright}s topical modules on Assets and Liabilities to examine associations between the EITC expansions during the early 1990s and the unsecured debt of the households of single mothers. We use two difference-in-differences comparisons over the study period 1988{\textendash}1999, first comparing single mothers to single childless women, and then comparing single mothers with two or more children to single mothers with exactly one child. In both cases we find that the EITC expansions are associated with a relative decline in the unsecured debt of affected households of single mothers. While not direct evidence of a causal relationship, this is suggestive evidence that single mothers may have used part of their EITC to limit the growth of their unsecured debt during this period.

}, keywords = {Earned Income Tax Credit, Single Mothers, Unsecured Debt}, author = {Shaefer, H. Luke and Song, Xiaoqing and Williams Shanks, Trina R.} } @conference {Wikle2013b, title = {Ecological Prediction with Nonlinear Multivariate Time-Frequency Functional Data Models}, booktitle = {Joint Statistical Meetings 2013}, year = {2013}, month = {August}, address = {Montreal, Canada}, author = {Wikle, C.K.} } @article {Holan2014c, title = {Ecological Prediction With Nonlinear Multivariate Time-Frequency Functional Data Models}, journal = {Journal of Agricultural, Biological, and Environmental Statistics}, volume = {18}, year = {2013}, chapter = {450-474}, doi = {10.1007/s13253-013-0142-1}, url = {http://link.springer.com/article/10.1007/s13253-013-0142-1}, author = {Yang, W.H. and Wikle, C.K. and Holan, S.H. and Wildhaber, M.L.} } @article {rom:hof:acq:2013, title = {Empirical Analysis of Data Breach Litigation}, journal = {Journal of Empirical Legal Studies}, volume = {11}, number = {1}, year = {2013}, pages = {74{\textendash}104}, author = {Romanosky, A. and Hoffman, D. and Acquisti, A.} } @conference {LagozeEtAl2013b, title = {Encoding Provenance Metadata for Social Science Datasets}, booktitle = {Metadata and Semantics Research}, series = {Communications in Computer and Information Science}, volume = {390}, year = {2013}, pages = {123-134}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, keywords = {DDI, eSocial Science, Metadata, Provenance}, isbn = {978-3-319-03436-2}, doi = {10.1007/978-3-319-03437-9_13}, url = {http://dx.doi.org/10.1007/978-3-319-03437-9_13}, author = {Lagoze, Carl and Williams, Jeremy and Vilhuber, Lars}, editor = {Garoufallou, Emmanouel and Greenberg, Jane} } @techreport {handle:1813:34443, title = {Encoding Provenance of Social Science Data: Integrating PROV with DDI}, number = {1813:34443}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Encoding Provenance of Social Science Data: Integrating PROV with DDI Lagoze, Carl; Block, William C; Williams, Jeremy; Abowd, John; Vilhuber, Lars Provenance is a key component of evaluating the integrity and reusability of data for scholarship. While recording and providing access to provenance has always been important, it is even more critical in the web environment in which data from distributed sources and of varying integrity can be combined and derived. The PROV model, developed under the auspices of the W3C, is a foundation for semantically-rich, interoperable, and web-compatible provenance metadata. We report on the results of our experimentation with integrating the PROV model into the DDI metadata for a complex, but characteristic, example of social science data. We also present some preliminary thinking on how to visualize those graphs in the user interface. Submitted to EDDI13 5th Annual European DDI User Conference December 2013, Paris, France}, url = {http://hdl.handle.net/1813/34443}, author = {Lagoze, Carl and Block, William C and Williams, Jeremy and Abowd, John and Vilhuber, Lars} } @conference {LagozeEtAl2013, title = {Encoding Provenance of Social Science Data: Integrating PROV with DDI}, booktitle = {5th Annual European DDI User Conference}, year = {2013}, abstract = {Provenance is a key component of evaluating the integrity and reusability of data for scholarship.
While recording and providing access to provenance has always been important, it is even more critical in the web environment in which data from distributed sources and of varying integrity can be combined and derived. The PROV model, developed under the auspices of the W3C, is a foundation for semantically-rich, interoperable, and web-compatible provenance metadata. We report on the results of our experimentation with integrating the PROV model into the DDI metadata for a complex, but characteristic, example of social science data. We also present some preliminary thinking on how to visualize those graphs in the user interface.}, keywords = {DDI, eSocial Science, Metadata, Provenance}, author = {Carl Lagoze and William C. Block and Jeremy Williams and Lars Vilhuber} } @article {RebeccaC.Steorts2013, title = {On estimation of mean squared errors of benchmarked and empirical Bayes estimators}, journal = {Statistica Sinica}, volume = {23}, year = {2013}, pages = {749{\textendash}767}, author = {Rebecca C. Steorts and Malay Ghosh} } @conference {LiZhang2013a, title = {Exact Sparse Recovery with L0 Projections}, booktitle = {19th ACM SIGKDD Conference on Knowledge Discovery and Data Mining}, year = {2013}, month = {August}, author = {Ping Li and Cun-Hui Zhang} } @conference {2143, title = {Examining item nonresponse through paradata and respondent characteristics: A multilevel approach}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Cordova-Cazar, A.L.} } @conference {2152, title = {Examining response time outliers through paradata in Online Panel Surveys}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Lee, J. and T. Al Baghal} } @conference {2163, title = {Examining the relationship between error and behavior in the American Time Use Survey using audit trail paradata}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Ruther, N. and T. Al Baghal and A. Eck and L. Stuart and L. Phillips and R. Belli and Soh, L-K} } @techreport {handle:1813:37987, title = {Fast Near Neighbor Search in High-Dimensional Binary Data}, number = {1813:37987}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Fast Near Neighbor Search in High-Dimensional Binary Data Shrivastava, Anshumali; Li, Ping Numerous applications in search, databases, machine learning, and computer vision can benefit from efficient algorithms for near neighbor search. This paper proposes a simple framework for fast near neighbor search in high-dimensional binary data, which are common in practice (e.g., text). We develop a very simple and effective strategy for sub-linear time near neighbor search, by creating hash tables directly using the bits generated by b-bit minwise hashing.
The advantages of our method are demonstrated through thorough comparisons with two strong baselines: spectral hashing and sign (1-bit) random projections.}, url = {http://hdl.handle.net/1813/37987}, author = {Shrivastava, Anshumali and Li, Ping} } @conference {Porter2013, title = {Flexible Semiparametric Hierarchical Spatial Models}, booktitle = {Joint Statistical Meetings 2013}, year = {2013}, month = {August}, address = {Montreal, Canada}, author = {Porter, A.T.} } @article {wan:leo:chen:2013, title = {From Facebook Regrets to Facebook Privacy Nudges}, journal = {Ohio State Law Journal}, year = {2013}, note = {Invited paper}, author = {Wang, Y. and Leon, P. G. and Chen, X. and Komanduri, S. and Norcie, G. and Scott, K. and Acquisti, A. and Cranor, L. F. and Sadeh, N.} } @article {sad:fie:2013, title = {A Generalized Fellegi-Sunter Framework for Multiple Record Linkage with Application to Homicide Record Systems}, journal = {Journal of the American Statistical Association}, volume = {108}, number = {502}, year = {2013}, pages = {385{\textendash}397}, doi = {10.1080/01621459.2012.757231}, url = {http://dx.doi.org/10.1080/01621459.2012.757231}, author = {Sadinle, M. and Fienberg, S. E.} } @article {acq:adj:bra:2013, title = {Gone in 15 Seconds: The Limits of Privacy Transparency and Control}, journal = {IEEE Security \& Privacy}, volume = {11}, number = {4}, year = {2013}, pages = {72{\textendash}74}, author = {Acquisti, A. and Adjerid, I. and Brandimarte, L.} } @article {deng2013, title = {Handling Attrition in Longitudinal Studies: The Case for Refreshment Samples}, journal = {Statist. Sci.}, volume = {28}, year = {2013}, month = {05/2013}, pages = {238{\textendash}256}, chapter = {238}, abstract = {Panel studies typically suffer from attrition, which reduces sample size and can result in biased inferences. It is impossible to know whether or not the attrition causes bias from the observed panel data alone. Refreshment samples{\textemdash}new, randomly sampled respondents given the questionnaire at the same time as a subsequent wave of the panel{\textemdash}offer information that can be used to diagnose and adjust for bias due to attrition. We review and bolster the case for the use of refreshment samples in panel studies. We include examples of both a fully Bayesian approach for analyzing the concatenated panel and refreshment data, and a multiple imputation approach for analyzing only the original panel. For the latter, we document a positive bias in the usual multiple imputation variance estimator. We present models appropriate for three waves and two refreshment samples, including nonterminal attrition. We illustrate the three-wave analysis using the 2007{\textendash}2008 Associated Press{\textendash}Yahoo! News Election Poll.}, doi = {10.1214/13-STS414}, url = {http://dx.doi.org/10.1214/13-STS414}, author = {Deng, Yiting and Hillygus, D. Sunshine and Reiter, Jerome P. and Si, Yajuan and Zheng, Siyu} } @article {Wikle2013d, title = {Hierarchical Bayesian Spatio-Temporal Conway-Maxwell Poisson Models with Dynamic Dispersion}, journal = {Journal of Agricultural, Biological, and Environmental Statistics}, volume = {18}, year = {2013}, pages = {335-356}, address = {Anchorage, Alaska}, doi = {10.1007/s13253-013-0141-2}, url = {http://link.springer.com/article/10.1007/s13253-013-0141-2}, author = {Wu, G. and Holan, S.H. 
and Wikle, C.K.} } @article {Wikle2013, title = {Hierarchical Spatio-Temporal Models and Survey Research}, journal = {Statistics Views}, year = {2013}, month = {May}, url = {http://www.statisticsviews.com/details/feature/4730991/Hierarchical-Spatio-Temporal-Models-and-Survey-Research.html}, author = {Wikle, C. and Holan, S. and Cressie, N.} } @article {Sengupta2013, title = {Hierarchical Statistical Modeling of Big Spatial Datasets Using the Exponential Family of Distributions}, journal = {Spatial Statistics}, volume = {4}, year = {2013}, pages = {14-44}, keywords = {EM algorithm, Empirical Bayes, Geostatistical process, Maximum likelihood estimation, MCMC, SRE model}, doi = {10.1016/j.spasta.2013.02.002}, url = {http://www.sciencedirect.com/science/article/pii/S2211675313000055}, author = {Sengupta, A. and Cressie, N.} } @booklet {Cressie2013, title = {How can survey estimates of small areas be improved by leveraging social-media data?}, journal = {The Survey Statistician}, number = {68}, year = {2013}, month = {July}, url = {http://isi.cbs.nl/iass/N68.pdf}, author = {Cressie, N. and Holan, S. and Wikle, C.} } @article {Spielman2013a, title = {Identifying Neighborhoods Using High Resolution Population Data}, journal = {Annals of the Association of American Geographers}, volume = {103}, year = {2013}, pages = {67-84}, author = {S.E. Spielman and J. Logan} } @techreport {handle:1813:33362, title = {Improving User Access to Metadata for Public and Restricted Use US Federal Statistical Files}, number = {1813:33362}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Improving User Access to Metadata for Public and Restricted Use US Federal Statistical Files Block, William C.; Williams, Jeremy; Vilhuber, Lars; Lagoze, Carl; Brown, Warren; Abowd, John M. Presentation at NADDI 2013 This record has also been archived at http://kuscholarworks.ku.edu/dspace/handle/1808/11093 .}, url = {http://hdl.handle.net/1813/33362}, author = {Block, William C. and Williams, Jeremy and Vilhuber, Lars and Lagoze, Carl and Brown, Warren and Abowd, John M.} } @conference {bal:pee:bra:2013, title = {Is it the Typeset or the Type of Statistics? Disfluent Font and Self-Disclosure}, booktitle = {Proceedings of Learning from Authoritative Security Experiment Results (LASER)}, year = {2013}, publisher = {USENIX Association}, organization = {USENIX Association}, address = {New York, NY}, url = {https://www.usenix.org/laser2013/program/balebako}, author = {Balebako, R. and Pe{\textquoteright}er, E. and Brandimarte, L. and Cranor, L. F. and Acquisti, A.} } @techreport {handle:1813:34534, title = {Managing Confidentiality and Provenance across Mixed Private and Publicly-Accessed Data and Metadata}, number = {1813:34534}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Managing Confidentiality and Provenance across Mixed Private and Publicly-Accessed Data and Metadata Vilhuber, Lars; Abowd, John; Block, William; Lagoze, Carl; Williams, Jeremy Social science researchers are increasingly interested in making use of confidential micro-data that contains linkages to the identities of people, corporations, etc. The value of this linking lies in the potential to join these identifiable entities with external data such as genome data, geospatial information, and the like. Leveraging these linkages is an essential aspect of {\textquotedblleft}big data{\textquotedblright} scholarship. 
However, the utility of these confidential data for scholarship is compromised by the complex nature of their management and curation. This makes it difficult to fulfill US federal data management mandates and interferes with basic scholarly practices such as validation and reuse of existing results. We describe in this paper our work on the CED2AR prototype, a first step in providing researchers with a tool that spans the confidential/publicly-accessible divide, making it possible for researchers to identify, search, access, and cite those data. The particular points of interest in our work are the cloaking of metadata fields and the expression of provenance chains. For the former, we make use of existing fields in the DDI (Data Documentation Initiative) specification and suggest some minor changes to the specification. For the latter problem, we investigate the integration of DDI with recent work by the W3C PROV working group that has developed a generalizable and extensible model for expressing data provenance.}, url = {http://hdl.handle.net/1813/34534}, author = {Vilhuber, Lars and Abowd, John and Block, William and Lagoze, Carl and Williams, Jeremy} } @article {Belli2013, title = {Memory, communication, and data quality in calendar interviews}, journal = {Public Opinion Quarterly}, volume = {77}, year = {2013}, pages = {194-219}, author = {Belli, R. F. and Bilgen, I. and T. Al Baghal} } @mastersthesis {2412, title = {Mental Disorders and Inequality in the United States: Intersection of race, gender, and disability on employment and income}, volume = {Ph.D. }, year = {2013}, school = {Wayne State University}, author = {Camp, J.} } @article {LauraBrandimarte2013, title = {Misplaced confidences: Privacy and the control paradox}, journal = {Social Psychological and Personality Science}, volume = {4}, number = {3}, year = {2013}, pages = {340{\textendash}347}, doi = {10.1177/1948550612455931}, author = {Laura Brandimarte and Alessandro Acquisti and George Loewenstein} } @techreport {handle:1813:45870, title = {NCRN Meeting Spring 2013}, number = {1813:45870}, year = {2013}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2013 Vilhuber, Lars Held at the NISS Headquarters, Research Triangle Park, NC.}, url = {http://hdl.handle.net/1813/45870}, author = {Vilhuber, Lars} } @techreport {handle:1813:40232, title = {NCRN Newsletter: Volume 1 - Issue 1}, number = {1813:40232}, year = {2013}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Newsletter: Volume 1 - Issue 1 Vilhuber, Lars; Karr, Alan; Reiter, Jerome; Abowd, John; Nunnelly, Jamie Overview of activities at NSF-Census Research Network nodes from July 2013 to November 2013. NCRN Newsletter Vol. 1, Issue 1: November 17, 2013}, url = {http://hdl.handle.net/1813/40232}, author = {Vilhuber, Lars and Karr, Alan and Reiter, Jerome and Abowd, John and Nunnelly, Jamie} } @article {spielman2013EPB, title = {Neighborhood contexts, health, and behavior: understanding the role of scale and residential sorting}, journal = {Environment and Planning B}, volume = {3}, year = {2013}, author = {Spielman, S. E. and Linkletter, C.
and Yoo, E.-H.} } @conference {Wikle2013e, title = {Nonlinear Dynamic Spatio-Temporal Statistical Models}, booktitle = {Southern Regional Council on Statistics Summer Research Conference}, year = {2013}, month = {June}, author = {Wikle, C.K.} } @article {Si2013, title = {Nonparametric Bayesian multiple imputation for incomplete categorical variables in large-scale assessment surveys}, journal = {Journal of Educational and Behavioral Statistics}, volume = {38}, year = {2013}, pages = {499-521}, url = {http://www.stat.duke.edu/~jerry/Papers/StatinMed14.pdf}, author = {Si, Y. and Reiter, J.P.} } @conference {2160, title = {Paradata for Measurement Error Evaluation}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Olson, K.} } @conference {2154, title = {Predicting survey breakoff in Internet survey panels}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {McCutcheon, A.L. and T. Al Baghal} } @conference {2141, title = {Predicting the occurrence of respondent retrieval strategies in calendar interviewing: The quality of autobiographical recall in surveys}, booktitle = {Biennial conference of the Society for Applied Research in Memory and Cognition}, year = {2013}, address = {Rotterdam, Netherlands}, url = {http://static1.squarespace.com/static/504170d6e4b0b97fe5a59760/t/52457a8be4b0012b7a5f462a/1380285067247/SARMAC_X_PaperJune27.pdf}, author = {Belli, R.F. and Miller, L.D. and Soh, L-K and T. Al Baghal} } @conference {2140, title = {Predicting the occurrence of respondent retrieval strategies in calendar interviewing: The quality of retrospective reports}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Belli, R.F. and Miller, L.D. and Soh, L-K and T. Al Baghal} } @techreport {handle:1813:40255, title = {Presentation: Predicting Multiple Responses with Boosting and Trees}, number = {1813:40255}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Presentation: Predicting Multiple Responses with Boosting and Trees Li, Ping; Abowd, John Presentation by Ping Li and John Abowd at FCSM on November 4, 2013}, url = {http://hdl.handle.net/1813/40255}, author = {Li, Ping and Abowd, John} } @conference {2164, title = {The process of turning audit trails from a CATI survey into useful data: Interviewer behavior paradata in the American Time Use Survey}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Ruther, N. and Phipps, P. and Belli, R.F.} } @booklet {Holan2013b, title = {Recent Advances in Spatial Methods for Federal Surveys}, year = {2013}, month = {September}, author = {Holan, S.H.} } @techreport {2413, title = {Reconsidering the Consequences of Worker Displacements: Survey versus Administrative Measurements}, year = {2013}, institution = {University of Michigan}, type = {mimeo}, abstract = {Displaced workers suffer persistent earnings losses. 
This stark finding has been established by following workers in administrative data after mass layoffs under the presumption that these are involuntary job losses owing to economic distress. Using linked survey and administrative data, this paper examines this presumption by matching worker-supplied reasons for separations with what is happening at the firm. The paper documents substantially different earnings dynamics in mass layoffs depending on the reason the worker gives for the separation. Using a new methodology for accounting for the increase in the probability of separation among all types of survey response during a mass layoff, the paper finds earnings loss estimates that are surprisingly close to those using only administrative data. Finally, the survey-administrative link allows the decomposition of earnings losses due to subsequent nonemployment into non-participation and unemployment. Including the zero earnings of those identified as being unemployed substantially increases the estimate of earnings losses.}, url = {http://www-personal.umich.edu/~shapiro/papers/ReconsideringDisplacements.pdf}, author = {Flaaen, Aaron and Shapiro, Matthew and Isaac Sorkin} } @booklet {Bradley2013, title = {A Reduced Rank Model for Analyzing Multivariate Spatial Datasets}, journal = {University of Missouri-Kansas City}, year = {2013}, month = {November}, publisher = {University of Missouri-Kansas City}, author = {Bradley, J.R.} } @article {2261, title = {Ringtail: a generalized nowcasting system.}, journal = {WebDB}, volume = {6}, year = {2013}, pages = {1358-1361}, chapter = {1358}, abstract = {Social media nowcasting{\textemdash}using online user activity to describe real-world phenomena{\textemdash}is an active area of research to supplement more traditional and costly data collection methods such as phone surveys. Given the potential impact of such research, we would expect general-purpose nowcasting systems to quickly become a standard tool among non-computer scientists, yet it has largely remained a research topic. We believe a major obstacle to widespread adoption is the nowcasting feature selection problem. Typical nowcasting systems require the user to choose a handful of social media objects from a pool of billions of potential candidates, which can be a time-consuming and error-prone process. We have built Ringtail, a nowcasting system that helps the user by automatically suggesting high-quality signals. We demonstrate that Ringtail can make nowcasting easier by suggesting relevant features for a range of topics. The user provides just a short topic query (e.g., unemployment) and a small conventional dataset in order for Ringtail to quickly return a usable predictive nowcasting model.}, url = {http://cs.stanford.edu/people/chrismre/papers/Ringtail-VLDB-demo.pdf}, author = {Antenucci, Dolan and Li, Erdong and Liu, Shaobo and Zhang, Bochun and Cafarella, Michael J and R{\'e}, Christopher} } @article {2262, title = {Ringtail: Feature Selection for Easier Nowcasting.}, journal = {WebDB}, year = {2013}, pages = {49-54}, chapter = {49}, abstract = {In recent years, social media {\textquotedblleft}nowcasting{\textquotedblright}{\textemdash}the use of online user activity to predict various ongoing real-world social phenomena{\textemdash}has become a popular research topic; yet, this popularity has not led to widespread actual practice. We believe a major obstacle to widespread adoption is the feature selection problem.
Typical nowcasting systems require the user to choose a set of relevant social media objects, which is difficult, time-consuming, and can imply a statistical background that users may not have. We propose Ringtail, which helps the user choose relevant social media signals. It takes a single user input string (e.g., unemployment) and yields a number of relevant signals the user can use to build a nowcasting model. We evaluate Ringtail on six different topics using a corpus of almost 6 billion tweets, showing that features chosen by Ringtail in a wholly-automated way are better or as good as those from a human and substantially better if Ringtail receives some human assistance. In all cases, Ringtail reduces the burden on the user.}, url = {http://www.cs.stanford.edu/people/chrismre/papers/webdb_ringtail.pdf}, author = {Antenucci, Dolan and Cafarella, Michael J and Levenstein, Margaret C. and R{\'e}, Christopher and Shapiro, Matthew} } @article {2259, title = {Rising extreme poverty in the United States and the response of means-tested transfers.}, journal = {Social Service Review}, volume = {87}, year = {2013}, month = {06/2013}, pages = {250-268}, chapter = {250}, abstract = {This study documents an increase in the prevalence of extreme poverty among US households with children between 1996 and 2011 and assesses the response of major federal means-tested transfer programs. Extreme poverty is defined using a World Bank metric of global poverty: \$2 or less, per person, per day. Using the 1996{\textendash}2008 panels of the Survey of Income and Program Participation (SIPP), we estimate that in mid-2011, 1.65 million households with 3.55 million children were living in extreme poverty in a given month, based on cash income, constituting 4.3 percent of all nonelderly households with children. The prevalence of extreme poverty has risen sharply since 1996, particularly among those most affected by the 1996 welfare reform. Adding SNAP benefits to household income reduces the number of extremely poor households with children by 48.0 percent in mid-2011. Adding SNAP, refundable tax credits, and housing subsidies reduces it by 62.8 percent.}, doi = {10.1086/671012}, url = {http://www.jstor.org/stable/10.1086/671012}, author = {H. Luke Shaefer and Edin, K.} } @conference {adj:acq:low:2013, title = {Sleights of Privacy: Framing, Disclosures, and the Limits of Transparency}, booktitle = {Proceedings of the Ninth Symposium on Usable Privacy and Security (SOUPS)}, year = {2013}, publisher = {ACM}, organization = {ACM}, address = {New York, NY}, author = {Adjerid, I. and Acquisti, A. and Loewenstein, G.} } @booklet {Cressie2013b, title = {Some Historical Remarks on Spatial Statistics, Spatio-Temporal Statistics}, journal = {Reading Group, University of Missouri}, year = {2013}, month = {April}, author = {Cressie, N.} } @mastersthesis {1791, title = {Some Recent Advances in Non- and Semiparametric Bayesian Modeling with Copulas, Mixtures, and Latent Variables (Ph.D. Thesis)}, year = {2013}, school = {Duke University}, type = {Ph.D.}, abstract = {This thesis develops flexible non- and semiparametric Bayesian models for mixed continuous, ordered and unordered categorical data. These methods have a range of possible applications; the applications considered in this thesis are drawn primarily from the social sciences, where multivariate, heterogeneous datasets with complex dependence and missing observations are the norm.
The first contribution is an extension of the Gaussian factor model to Gaussian copula factor models, which accommodate continuous and ordinal data with unspecified marginal distributions. I describe how this model is the most natural extension of the Gaussian factor model, preserving its essential dependence structure and the interpretability of factor loadings and the latent variables. I adopt an approximate likelihood for posterior inference and prove that, if the Gaussian copula model is true, the approximate posterior distribution of the copula correlation matrix asymptotically converges to the correct parameter under nearly any marginal distributions. I demonstrate with simulations that this method is both robust and efficient, and illustrate its use in an application from political science. The second contribution is a novel nonparametric hierarchical mixture model for continuous, ordered and unordered categorical data. The model includes a hierarchical prior used to couple component indices of two separate models, which are also linked by local multivariate regressions. This structure effectively overcomes the limitations of existing mixture models for mixed data, namely the overly strong local independence assumptions. In the proposed model local independence is replaced by local conditional independence, so that the induced model is able to more readily adapt to structure in the data. I demonstrate the utility of this model as a default engine for multiple imputation of mixed data in a large repeated-sampling study using data from the Survey of Income and Participation. I show that it improves substantially on its most popular competitor, multiple imputation by chained equations (MICE), while enjoying certain theoretical properties that MICE lacks. The third contribution is a latent variable model for density regression. Most existing density regression models are quite flexible but somewhat cumbersome to specify and fit, particularly when the regressors are a combination of continuous and categorical variables. The majority of these methods rely on extensions of infinite discrete mixture models to incorporate covariate dependence in mixture weights, atoms or both. I take a fundamentally different approach, introducing a continuous latent variable which depends on covariates through a parametric regression. In turn, the observed response depends on the latent variable through an unknown function. I demonstrate that a spline prior for the unknown function is quite effective relative to Dirichlet Process mixture models in density estimation settings (i.e., without covariates) even though these Dirichlet process mixtures have better theoretical properties asymptotically. The spline formulation enjoys a number of computational advantages over more flexible priors on functions. Finally, I demonstrate the utility of this model in regression applications using a dataset on U.S. wages from the Census Bureau, where I estimate the return to schooling as a smooth function of the quantile index.}, url = {http://dukespace.lib.duke.edu/dspace/handle/10161/8253}, author = {Jared S. 
Murray} } @booklet {Porter2013a, title = {Spatial Fay-Herriot Models for Small Area Estimation with Functional Covariates}, year = {2013}, month = {May}, author = {Porter, A.T.} } @inbook {Holan2013, title = {Spatio-temporal Design: Advances in Efficient Data Acquisition}, booktitle = {Spatio-temporal Design: Advances in Efficient Data Acquisition}, year = {2013}, pages = {269-284}, publisher = {Wiley}, organization = {Wiley}, chapter = {Semiparametric Dynamic Design of Monitoring Networks for Non-Gaussian Spatio-Temporal Data}, keywords = {semiparametric dynamic design for non-Gaussian spatio-temporal data}, isbn = {9780470974292}, doi = {10.1002/9781118441862}, author = {Holan, S. and Wikle, C.}, editor = {Jorge Mateu and Werner Muller} } @booklet {Wikle2013c, title = {Statistics and the Environment: Overview and Challenges}, year = {2013}, note = {Invited Introductory Overview Lecture}, month = {May}, author = {Wikle, C.K.} } @booklet {Cressie2013c, title = {Statistics for Spatio-Temporal Data}, journal = {Invited One-Day Short Course at the U.S. Census Bureau}, year = {2013}, month = {April}, author = {Cressie, N.} } @conference {2161, title = {Troubles with time-use: Examining potential indicators of error in the American Time Use Survey}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Phillips, A.L. and T. Al Baghal and Belli, R.F.} } @article {1559, title = {Two-stage Bayesian benchmarking as applied to small area estimation}, journal = {TEST}, volume = {22}, year = {2013}, month = {2013}, chapter = {670}, keywords = {small area estimation}, author = {Rebecca C. Steorts and Malay Ghosh} } @mastersthesis {Stuart2013, title = {User Modeling via Machine Learning and Rule-based Reasoning to Understand and Predict Errors in Survey Systems}, year = {2013}, school = {University of Nebraska-Lincoln}, type = {Masters}, url = {http://digitalcommons.unl.edu/computerscidiss/70/}, author = {Stuart, Leonard Cleve} } @article {spielman2013using, title = {Using High Resolution Population Data to Identify Neighborhoods and Determine their Boundaries}, journal = {Annals of the Association of American Geographers}, volume = {103}, number = {1}, year = {2013}, pages = {67-84}, doi = {10.1080/00045608.2012.685049}, url = {http://www.tandfonline.com/doi/abs/10.1080/00045608.2012.685049}, author = {Spielman, S. E. and Logan, J.} } @mastersthesis {Wilson2013, title = {Using Satellite Imagery to Evaluate and Analyze Socioeconomic Changes Observed with Census Data}, year = {2013}, note = {NCRN}, type = {Ph.D.}, author = {Wilson, C. R.} } @conference {2134, title = {What are you doing now?: Audit trails, Activity level responses and error in the American Time Use Survey}, booktitle = {American Association for Public Opinion Research}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {T. Al Baghal and Phillips, A.L. and Ruther, N. and Belli, R.F. and Stuart, L. and Eck, A. and Soh, L-K} } @article {acq:joh:loe:2013, title = {What is Privacy Worth?}, journal = {Journal of Legal Studies}, volume = {42}, number = {2}, year = {2013}, note = {Leading paper, 2010 Future of Privacy Forum{\textquoteright}s Best {\textquoteleft}{\textquoteleft}Privacy Papers for Policy Makers{\textquoteright}{\textquoteright} Competition}, pages = {249{\textendash}274}, author = {Acquisti, A. 
and John, L. and Loewenstein, G.} } @article {YuvalNardi2012, title = {Achieving both valid and secure logistic regression analysis on aggregated data from different private sources}, journal = {Journal of Privacy and Confidentiality}, volume = {4}, year = {2012}, pages = {189}, author = {Yuval Nardi and Robert Hall and Stephen E. Fienberg} } @article {Holan2012, title = {An Approach for Identifying and Predicting Economic Recessions in Real-Time Using Time-Frequency Functional Models}, journal = {Applied Stochastic Models in Business and Industry}, volume = {28}, year = {2012}, note = {DOI: 10.1002/asmb.1954}, month = {12/2012}, pages = {485-499}, keywords = {Bayesian model averaging, business cycles, empirical orthogonal functions, functional data, MIDAS, spectrogram, stochastic search variable selection}, doi = {10.1002/asmb.1954}, url = {http://onlinelibrary.wiley.com/doi/10.1002/asmb.1954/full}, author = {Holan, S. and Yang, W. and Matteson, D. and Wikle, C.K.} } @booklet {McElroy2012, title = {Asymptotic Theory of Cepstral Random Fields}, year = {2012}, note = {Arxiv Preprint arXiv:1112.1977}, publisher = {University of Missouri}, author = {McElroy, T. and Holan, S.} } @techreport {handle:1813:34461, title = {Asymptotic Theory of Cepstral Random Fields}, number = {1813:34461}, year = {2012}, institution = {University of Missouri}, type = {Preprint}, abstract = {Asymptotic Theory of Cepstral Random Fields McElroy, T.S.; Holan, S.H. Random fields play a central role in the analysis of spatially correlated data and, as a result,have a significant impact on a broad array of scientific applications. Given the importance of this topic, there has been a substantial amount of research devoted to this area. However, the cepstral random field model remains largely underdeveloped outside the engineering literature. We provide a comprehensive treatment of the asymptotic theory for two-dimensional random field models. In particular, we provide recursive formulas that connect the spatial cepstral coefficients to an equivalent moving-average random field, which facilitates easy computation of the necessary autocovariance matrix. Additionally, we establish asymptotic consistency results for Bayesian, maximum likelihood, and quasi-maximum likelihood estimation of random field parameters and regression parameters. Further, in both the maximum and quasi-maximum likelihood frameworks, we derive the asymptotic distribution of our estimator. The theoretical results are presented generally and are of independent interest,pertaining to a wide class of random field models. The results for the cepstral model facilitate model-building: because the cepstral coefficients are unconstrained in practice, numerical optimization is greatly simplified, and we are always guaranteed a positive definite covariance matrix. We show that inference for individual coefficients is possible, and one can refine models in a disciplined manner. Finally, our results are illustrated through simulation and the analysis of straw yield data in an agricultural field experiment. http://arxiv.org/pdf/1112.1977.pdf}, url = {http://hdl.handle.net/1813/34461}, author = {McElroy, T.S. 
and Holan, S.H.} } @article {Wang2012, title = {Bayesian Multi-Regime Smooth Transition Regression with Ordered Categorical Variables}, journal = {Computational Statistics and Data Analysis}, volume = {56}, year = {2012}, note = {http://dx.doi.org/10.1016/j.csda.2012.04.018}, month = {December}, pages = {4165-4179}, doi = {10.1016/j.csda.2012.04.018}, url = {http://dx.doi.org/10.1016/j.csda.2012.04.018}, author = {Wang, J. and Holan, S.} } @booklet {Holan2012f, title = {Bayesian Multiscale Multiple Imputation With Implications to Data Confidentiality}, year = {2012}, note = {Texas A\&M University, January 2012; Duke University (Hosted by Duke Node), February 2012; Rice University, March 2012; Clemson University, April 2012}, author = {Holan, S.H.} } @conference {hal:ste:fie:2012, title = {Bayesian Parametric and Nonparametric Inference for Multiple Record Linkage}, booktitle = {Modern Nonparametric Methods in Machine Learning Workshop}, year = {2012}, publisher = {NIPS}, organization = {NIPS}, url = {http://www.stat.cmu.edu/NCRN/PUBLIC/files/beka_nips_finalsub4.pdf}, author = {Hall, R. and Steorts, R. and Fienberg, S. E.} } @conference {2139, title = {Calendar interviewing in life course research: Associations between verbal behaviors and data quality}, booktitle = {Eighth International Conference on Social Science Methodology}, year = {2012}, address = {Sydney, Australia}, url = {https://conference.acspri.org.au/index.php/rc33/2012/paper/view/366}, author = {Belli, R.F. and Bilgen, I. and T. Al Baghal} } @conference {Wikle2012c, title = {Change of Support in Spatio-Temporal Dynamical Models}, booktitle = {Joint Statistical Meetings}, year = {2012}, month = {August}, address = {Montreal, Canada}, author = {Wikle, C.K.} } @booklet {Charest2012a, title = {Confidentiality and Privacy Protection in a Non-US Census Context}, year = {2012}, month = {April}, publisher = {Carnegie Mellon University}, author = {Anne-Sophie Charest} } @conference {Feinberg2012d, title = {Counting the people}, booktitle = {Nathan and Beatrice Keyfitz Lecture in Mathematics and the Social Sciences}, year = {2012}, month = {May}, publisher = {Fields Institute}, organization = {Fields Institute}, address = {Toronto, Canada}, author = {Stephen E. Fienberg} } @mastersthesis {Charest2012, title = {Creation and Analysis of Differentially-Private Synthesis Datasets}, year = {2012}, note = {PhD Thesis, Department of Statistics}, school = {Carnegie Mellon University}, type = {phd}, author = {Anne-Sophie Charest} } @techreport {handle:1813:30924, title = {Data Management of Confidential Data}, number = {1813:30924}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {Data Management of Confidential Data Lagoze, Carl; Block, William C.; Williams, Jeremy; Abowd, John M.; Vilhuber, Lars Social science researchers increasingly make use of data that is confidential because it contains linkages to the identities of people, corporations, etc. The value of this data lies in the ability to join the identifiable entities with external data such as genome data, geospatial information, and the like. However, the confidentiality of this data is a barrier to its utility and curation, making it difficult to fulfill US federal data management mandates and interfering with basic scholarly practices such as validation and reuse of existing results. We describe the complexity of the relationships among data that span a public and private divide.
We then describe our work on the CED2AR prototype, a first step in providing researchers with a tool that spans this divide and makes it possible for them to search, access, and cite that data.}, url = {http://hdl.handle.net/1813/30924}, author = {Lagoze, Carl and Block, William C. and Williams, Jeremy and Abowd, John M. and Vilhuber, Lars} } @article {Xiaolin2012, title = {Differential Privacy for Protecting Multi-dimensional Contingency Table Data: Extensions and Applications}, journal = {Journal of Privacy and Confidentiality}, volume = {4}, number = {1}, year = {2012}, pages = {101-125}, author = {Yang Xiaolin and Stephen E. Fienberg and Alessandro Rinaldo} } @conference {Charest2012b, title = {Differential Privacy for Synthetic Datasets}, booktitle = {Proceedings of the Survey Research Section of the SSC}, year = {2012}, note = {Invited session on Confidentiality of the Annual Meeting of the Statistical Society of Canada}, address = {Guelph, Ontario}, author = {Anne-Sophie Charest} } @conference {Nugent2012, title = {Disambiguating USPTO Inventors with Classification Models Trained on Comparisons of Labeled Inventor Records}, booktitle = {Conference Presentation Classification Society Annual Meeting, Carnegie Mellon University}, year = {2012}, author = {Samuel Ventura and Rebecca Nugent and Erich R.H. Fuchs} } @techreport {handle:1813:30922, title = {An Early Prototype of the Comprehensive Extensible Data Documentation and Access Repository (CED2AR)}, number = {1813:30922}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {An Early Prototype of the Comprehensive Extensible Data Documentation and Access Repository (CED2AR) Block, William C.; Williams, Jeremy; Abowd, John M.; Vilhuber, Lars; Lagoze, Carl This presentation will demonstrate the latest DDI-related technological developments of Cornell University{\textquoteright}s $3 million NSF-Census Research Network (NCRN) award, dedicated to improving the documentation, discoverability, and accessibility of public and restricted data from the federal statistical system in the United States. The current internal name for our DDI-based system is the Comprehensive Extensible Data Documentation and Access Repository (CED{\texttwosuperior}AR). CED{\texttwosuperior}AR ingests metadata from heterogeneous sources and supports filtered synchronization between restricted and public metadata holdings. Currently-supported CED{\texttwosuperior}AR {\textquotedblleft}connector workflows{\textquotedblright} include mechanisms to ingest IPUMS, zero-observation files from the American Community Survey (DDI 2.1), and SIPP Synthetic Beta (DDI 1.2). These disparate metadata sources are all transformed into a DDI 2.5 compliant form and stored in a single repository. In addition, we will demonstrate an extension to DDI 2.5 that allows for the labeling of elements within the schema to indicate confidentiality. This metadata can then be filtered, allowing the creation of derived public use metadata from an original confidential source. This repository is currently searchable online through a prototype application demonstrating the ability to search across previously heterogeneous metadata sources. Presentation at the 4th Annual European DDI User Conference (EDDI12), Norwegian Social Science Data Services, Bergen, Norway, 3 December, 2012}, url = {http://hdl.handle.net/1813/30922}, author = {Block, William C. and Williams, Jeremy and Abowd, John M. 
and Vilhuber, Lars and Lagoze, Carl} } @conference {Brandimarte2012, title = {The Economics of Privacy}, booktitle = {The Oxford Handbook of the Digital Economy}, year = {2012}, pages = {547-570}, publisher = {Oxford University Press}, organization = {Oxford University Press}, isbn = {9780195397840}, doi = {10.1093/oxfordhb/9780195397840.013.0020}, author = {Laura Brandimarte and Alessandro Acquisti}, editor = {Martin Peitz and Joel Waldfogel} } @booklet {Wikle2012b, title = {Efficient Time-Frequency Representations in High-Dimensional Spatial and Spatio-Temporal Models}, year = {2012}, month = {October}, author = {Wikle, C.K.} } @conference {Charest2012c, title = {Empirical Evaluation of Statistical Inference from Differentially-Private Contingency Tables}, booktitle = {Privacy in Statistical Databases}, volume = {7556}, year = {2012}, note = {Print ISBN is 978-3-642-33626-3}, pages = {257-272}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-642-33627-0}, doi = {10.1007/978-3-642-33627-0_20}, author = {Anne-Sophie Charest}, editor = {Josep Domingo-Ferrer and Ilenia Tinnirello} } @techreport {handle:1813:55327, title = {Encoding Provenance Metadata for Social Science Datasets}, number = {1813:55327}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {Encoding Provenance Metadata for Social Science Datasets Lagoze, Carl; Williams, Jeremy; Vilhuber, Lars Recording provenance is a key requirement for data-centric scholarship, allowing researchers to evaluate the integrity of source data sets and reproduce, and thereby, validate results. Provenance has become even more critical in the web environment in which data from distributed sources and of varying integrity can be combined and derived. Recent work by the W3C on the PROV model provides the foundation for semantically-rich, interoperable, and web-compatible provenance metadata. We apply that model to complex, but characteristic, provenance examples of social science data, describe scenarios that make scholarly use of those provenance descriptions, and propose a manner for encoding this provenance metadata within the widely-used DDI metadata standard. Submitted to Metadata and Semantics Research (MTSR 2013) conference.}, url = {http://hdl.handle.net/1813/55327}, author = {Lagoze, Carl and Williams, Jeremy and Vilhuber, Lars} } @inbook {NIPS2012_1456, title = {Entropy Estimations Using Correlated Symmetric Stable Random Projections}, booktitle = {Advances in Neural Information Processing Systems 25}, year = {2012}, pages = {3185{\textendash}3193}, url = {http://books.nips.cc/papers/files/nips25/NIPS2012_1456.pdf}, author = {Ping Li and Cun-Hui Zhang}, editor = {P. Bartlett and F.C.N. Pereira and C.J.C. Burges and L. Bottou and K.Q. Weinberger} } @article {Manrique-Vallier2012, title = {Estimating identification disclosure risk using mixed membership models}, journal = {Journal of the American Statistical Association}, volume = {107}, year = {2012}, pages = {1385-1394}, author = {Manrique-Vallier, D. and Reiter, J.P.} } @conference {Steorts2012a, title = {On Estimation of Mean Squared Errors of Benchmarked and Empirical Bayes Estimators}, booktitle = {2012 Joint Statistical Meetings}, year = {2012}, month = {August}, address = {San Diego, CA}, author = {Rebecca C.
Steorts and Malay Ghosh} } @conference {2166, title = {Exploring interviewer and respondent interactions: An innovative behavior coding approach}, booktitle = {Midwest Association for Public Opinion Research 2012 Annual Conference}, year = {2012}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Walton, L. and Stange, M. and Powell, R. and Belli, R.F.} } @booklet {Shaefer2012, title = {Extreme Poverty in the United States, 1996 to 2011}, year = {2012}, note = {NCRN}, month = {February 2012}, publisher = {University of Michigan}, type = {Report}, url = {http://www.npc.umich.edu/publications/policy_briefs/brief28/policybrief28.pdf}, author = {Shaefer, H. Luke and Edin, Kathryn} } @conference {CIKM-SunSL12, title = {Fast Multi-task Learning for Query Spelling Correction}, booktitle = {The 21$^{st}$ ACM International Conference on Information and Knowledge Management (CIKM 2012)}, year = {2012}, pages = {285{\textendash}294}, doi = {10.1145/2396761.2396800}, url = {http://dx.doi.org/10.1145/2396761.2396800}, author = {Xu Sun and Anshumali Shrivastava and Ping Li} } @conference {ShrivastavaL12, title = {Fast Near Neighbor Search in High-Dimensional Binary Data}, booktitle = {The European Conference on Machine Learning (ECML 2012)}, year = {2012}, author = {Anshumali Shrivastava and Ping Li} } @conference {Holan2012e, title = {Flexible Spectral Models for Multivariate Time Series}, booktitle = {Joint Statistical Meetings 2012}, year = {2012}, month = {August}, author = {Holan, S.H.} } @techreport {Sadinle2012b, title = {A Generalized Fellegi-Sunter Framework for Multiple Record Linkage with Application to Homicide Records Systems}, number = {1205.3217}, year = {2012}, url = {https://arxiv.org/abs/1205.3217}, author = {Mauricio Sadinle and Stephen E. Fienberg} } @conference {LiSK12, title = {GPU-based minwise hashing}, booktitle = {Proceedings of the 21st World Wide Web Conference (WWW 2012) (Companion Volume)}, year = {2012}, pages = {565-566}, doi = {10.1145/2187980.2188129}, url = {http://doi.acm.org/10.1145/2187980.2188129}, author = {Ping Li and Anshumali Shrivastava and Arnd Christian K{\"o}nig} } @conference {Wikle2012, title = {Hierarchical General Quadratic Nonlinear Models for Spatio-Temporal Dynamics}, booktitle = {Red Raider Conference}, year = {2012}, month = {October}, publisher = {Texas Tech University}, organization = {Texas Tech University}, address = {Lubbock, TX}, author = {Wikle, C.K.} } @booklet {Sengupta2012, title = {Hierarchical Statistical Modeling of Big Spatial Datasets Using the Exponential Family of Distributions}, number = {879}, year = {2012}, publisher = {The Ohio State University}, author = {Sengupta, A. and Cressie, N.} } @booklet {Cressie2012a, title = {Inference for Count Data using the Spatial Random Effects Model}, year = {2012}, month = {May}, author = {Cressie, N.} } @article {Reiter2012, title = {Inferentially valid partially synthetic data: Generating from posterior predictive distributions not necessary}, journal = {Journal of Official Statistics}, volume = {28}, year = {2012}, pages = {583-590}, author = {Reiter, J.P.
and Kinney, S.K.} } @conference {Charoenruk2012, title = {Interviewer variance of interviewer and respondent behaviors: A new frontier in analyzing the interviewer-respondent interaction}, booktitle = {Midwest Association for Public Opinion Research 2012 Annual Conference}, year = {2012}, note = {Annual conference of the Midwest Association for Public Opinion Research, Chicago, Illinois.}, month = {November}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Charoenruk, N. and Parkhurst, B. and Ay, M. and Belli, R. F.} } @conference {Sadinle2012a, title = {Logit-Based Confidence Intervals for Single Capture-Recapture Estimation}, booktitle = {American Statistical Association Pittsburgh Chapter Banquet}, year = {2012}, note = {April 9, 2012}, month = {April}, address = {Pittsburgh, PA}, author = {Mauricio Sadinle} } @conference {Shalizi-JSM2012, title = {Maintaining Quality in the Face of Rapid Program Expansion}, booktitle = {2012 Joint Statistical Meetings}, year = {2012}, month = {August}, address = {San Diego, CA}, author = {Cosma Shalizi and Rebecca Nugent} } @conference {Ventura2012b, title = {Methods Matter: Revamping Inventor Disambiguation Algorithms with Classification Models and Labeled Inventor Records}, booktitle = {Conference Presentation Academy of Management Annual Meeting}, year = {2012}, month = {August}, address = {Boston, MA}, author = {Samuel Ventura and Rebecca Nugent and Erich R.H. Fuchs} } @conference {Sadinle2012, title = {MulFiles Record Linkage Using a Generalized Fellegi-Sunter Framework}, booktitle = {Conference Presentation Classification Society Annual Meeting, Carnegie Mellon University}, year = {2012}, author = {Mauricio Sadinle} } @techreport {handle:1813:45884, title = {NCRN Meeting Fall 2012}, number = {1813:45884}, year = {2012}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2012 Vilhuber, Lars Taken place at the Census Bureau Headquarters, Suitland, MD.}, url = {http://hdl.handle.net/1813/45884}, author = {Vilhuber, Lars} } @techreport {handle:1813:30925, title = {The NSF-Census Research Network: Cornell Node}, number = {1813:30925}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {The NSF-Census Research Network: Cornell Node Block, William C.; Lagoze, Carl; Vilhuber, Lars; Brown, Warren A.; Williams, Jeremy; Arguillas, Florio Cornell University has received a $3M NSF-Census Research Network (NCRN) award to improve the documentation and discoverability of both public and restricted data from the federal statistical system. The current internal name for this project is the Comprehensive Extensible Data Documentation and Access Repository (CED{\texttwosuperior}AR). The diagram to the right provides a high level architectural overview of the system to be implemented. The CED{\texttwosuperior}AR will be based upon leading metadata standards such as the Data Documentation Initiative (DDI) and Statistical Data and Metadata eXchange (SDMX) and be flexibly designed to ingest documentation from a variety of source files. It will permit synchronization between the public and confidential instances of the repository. The scholarly community will be able to use the CED{\texttwosuperior}AR as it would a conventional metadata repository, deprived only of the values of certain confidential information, but not their metadata. 
The authorized user, working on the secure Census Bureau network, could use the CED{\texttwosuperior}AR with full information in authorized domains.}, url = {http://hdl.handle.net/1813/30925}, author = {Block, William C. and Lagoze, Carl and Vilhuber, Lars and Brown, Warren A. and Williams, Jeremy and Arguillas, Florio} } @inbook {NIPS2012_1436, title = {One Permutation Hashing}, booktitle = {Advances in Neural Information Processing Systems 25}, year = {2012}, pages = {3122{\textendash}3130}, url = {http://books.nips.cc/papers/files/nips25/NIPS2012_1436.pdf}, author = {Ping Li and Art Owen and Cun-Hui Zhang}, editor = {P. Bartlett and F.C.N. Pereira and C.J.C. Burges and L. Bottou and K.Q. Weinberger} } @techreport {handle:1813:30937, title = {Presentation: Revisiting the Economics of Privacy: Population Statistics and Privacy as Public Goods}, number = {1813:30937}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {Presentation: Revisiting the Economics of Privacy: Population Statistics and Privacy as Public Goods Abowd, John Anonymization and data quality are intimately linked. Although this link has been properly acknowledged in the Computer Science and Statistical Disclosure Limitation literatures, economics offers a framework for formalizing the linkage and analyzing optimal decisions and equilibrium outcomes. The opinions expressed in this presentation are those of the author and neither the National Science Foundation nor the Census Bureau.}, url = {http://hdl.handle.net/1813/30937}, author = {Abowd, John} } @article {Fienberg2012, title = {Privacy in a world of electronic data: Whom should you trust?}, journal = {Notices of the AMS}, volume = {59}, year = {2012}, pages = {479}, author = {Stephen E. Fienberg} } @article {Fienberg2012a, title = {Privacy-preserving data sharing in high dimensional regression and classification settings}, journal = {Journal of Privacy and Confidentiality}, volume = {4}, year = {2012}, pages = {221}, author = {Stephen E. Fienberg and Jiashun Jin} } @inbook {raey, title = {A Proposed Solution to the Archiving and Curation of Confidential Scientific Inputs}, booktitle = {Privacy in Statistical Databases}, series = {Lecture Notes in Computer Science}, volume = {7556}, year = {2012}, pages = {216-225}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, keywords = {Data Archive, Data Curation, Privacy-preserving Datamining, Statistical Disclosure Limitation}, isbn = {978-3-642-33626-3}, doi = {10.1007/978-3-642-33627-0_17}, url = {http://dx.doi.org/10.1007/978-3-642-33627-0_17}, author = {Abowd, John M. and Vilhuber, Lars and Block, William}, editor = {Domingo-Ferrer, Josep and Tinnirello, Ilenia} } @conference {WWW-SunSL12, title = {Query spelling correction using multi-task learning}, booktitle = {Proceedings of the 21st World Wide Web Conference (WWW 2012)(Companion Volume)}, year = {2012}, pages = {613-614}, doi = {10.1145/2187980.2188153}, url = {http://doi.acm.org/10.1145/2187980.2188153}, author = {Xu Sun and Anshumali Shrivastava and Ping Li} } @article {Holan2012b, title = {Rejoinder: An approach for identifying and predicting economic recessions in real time using time frequency functional models}, journal = {Applied Stochastic Models in Business and Industry}, volume = {28}, year = {2012}, pages = {504-505}, doi = {10.1002/asmb.1955}, url = {http://onlinelibrary.wiley.com/doi/10.1002/asmb.1955/full}, author = {Holan, S. and Yang, W. and Matteson, D. 
and Wikle, C.} } @inbook {Holan2012a, title = {Semiparametric Dynamic Design of Monitoring Networks for Non-Gaussian Spatio-Temporal Data}, booktitle = {Spatio-temporal Design: Advances in Efficient Data Acquisition}, year = {2012}, pages = {269-284}, publisher = {Wiley}, organization = {Wiley}, address = {Chichester, UK}, doi = {10.1002/9781118441862.ch12}, url = {http://onlinelibrary.wiley.com/doi/10.1002/9781118441862.ch12/summary}, author = {Holan, S. and Wikle, C.K.}, editor = {Jorge Mateu and Werner Muller} } @conference {Adjerid2012, title = {Sleight of Privacy}, booktitle = {Conference on Web Privacy Measurement}, year = {2012}, author = {Idris Adjerid and Alessandro Acquisti and Laura Brandimarte} } @mastersthesis {Kurtz2012, title = {Smooth Post-Stratification in Multiple Capture Recapture}, year = {2012}, note = {Department of Statistics}, school = {Carnegie Mellon University}, type = {phd}, author = {Zachary Kurtz} } @booklet {Wikle2012a, title = {Spatio-Temporal Statistics at Mizzou, Truman School of Public Affairs}, year = {2012}, month = {October}, author = {Wikle, C.K.} } @conference {Stephe2012, title = {Statistics in Service to the Nation}, booktitle = {Presentation Samuel S. Wilks Lecture}, year = {2012}, note = {April 23, 2012}, month = {April}, address = {Princeton, NJ}, author = {Stephen E. Fienberg} } @conference {Feinberg-JSM2012, title = {Teaching about Big Data: Curricular Issues}, booktitle = {2012 Joint Statistical Meetings}, year = {2012}, month = {August}, address = {San Diego, CA}, author = {Stephen E. Fienberg} } @article {SrivastavaLS12, title = {Testing for Membership to the IFRA and the NBU Classes of Distributions}, journal = {Journal of Machine Learning Research - Proceedings Track for the Fifteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2012)}, volume = {22}, year = {2012}, pages = {1099-1107}, url = {http://jmlr.csail.mit.edu/proceedings/papers/v22/srivastava12.html}, author = {Radhendushka Srivastava and Ping Li and Debasis Sengupta} } @conference {Spielman2012, title = {Thinking inside the box: Mapping the microstructure of urban environment (and why it matters)}, booktitle = {AutoCarto 2012}, year = {2012}, address = {Columbus, Ohio}, keywords = {cartography}, url = {http://www.cartogis.org/docs/proceedings/2012/Spielman_etal_AutoCarto2012.pdf}, author = {Seth Spielman and David Folch and John Logan and Nicholas Nagle} } @conference {Phillips2012, title = {Troubles with time-use: Examining potential indicators of error in the ATUS}, booktitle = {Midwest Association for Public Opinion Research 2012 Annual Conference}, year = {2012}, note = {Presented at the annual conference of the Midwest Association for Public Opinion Research, Chicago, Illinois}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Phillips, A. L. and Al Baghal, T. and Belli, R. F.} } @conference {RobertHall2012, title = {Valid Statistical Inference on Automatically Matched Files}, booktitle = {Privacy in Statistical Databases}, year = {2012}, pages = {131{\textendash}142}, publisher = {Springer}, organization = {Springer}, doi = {10.1007/978-3-642-33627-0_11}, author = {Robert Hall and Stephen E.
Fienberg}, editor = {Josep Domingo-Ferrer and Ilenia Tinnirello} } @article {ShaeferYbarra2012, title = {The welfare reforms of the 1990s and the stratification of material well-being among low-income households with children}, journal = {Children and Youth Services Review}, volume = {34}, number = {8}, year = {2012}, note = {NCRN}, pages = {1810-1817}, type = {Journal Article}, abstract = {

We examine the incidence of material hardship experienced by low-income households with children, before and after the major changes to U.S. anti-poverty programs during the 1990s. We use the Survey of Income and Program Participation (SIPP) to examine a series of measures of household material hardship that were collected in the years 1992, 1995, 1998, 2003 and 2005. We stratify our sample to differentiate between the 1) deeply poor ({\textless}50\% of poverty), who saw a decline in public assistance over this period; and two groups that saw some forms of public assistance increase: 2) other poor households (50{\textendash}99\% of poverty), and 3) the near poor (100{\textendash}150\% of poverty). We report bivariate trends over the study period, as well as presenting multivariate difference-in-differences estimates. We find suggestive evidence that material hardship{\textemdash}in the form of difficulty meeting essential household expenses, and falling behind on utilities costs{\textemdash}has generally increased among the deeply poor but has remained roughly the same for the middle group (50{\textendash}99\% of poverty), and decreased among the near poor (100{\textendash}150\% of poverty). Multivariate difference-in-differences estimates suggest that these trends have resulted in intensified stratification of the material well-being of low-income households with children.

}, author = {Shaefer, H. Luke and Ybarra, Marci} } @conference {sad:hal:fie:2011, title = {Approaches to Multiple Record Linkage}, booktitle = {Proceedings of the 58th World Statistical Congress}, year = {2011}, pages = {1064{\textendash}1071}, publisher = {International Statistical Institute}, organization = {International Statistical Institute}, address = {Dublin}, url = {http://2011.isiproceedings.org/papers/450092.pdf}, author = {Sadinle, M. and Hall, R. and Fienberg, S. E.} } @article {Fienberg2011, title = {Comment on Gates: Toward a Reconceptualization of Confidentiality Protection in the Context of Linkages with Administrative Records}, journal = {Journal of Privacy and Confidentiality}, volume = {3}, year = {2011}, pages = {65}, author = {Stephen E. Fienberg} } @techreport {handle:1813:34516, title = {Do Single Mothers in the United States use the Earned Income Tax Credit to Reduce Unsecured Debt?}, number = {1813:34516}, year = {2011}, institution = {University of Michigan}, type = {Preprint}, abstract = {Do Single Mothers in the United States use the Earned Income Tax Credit to Reduce Unsecured Debt? Shaefer, H. Luke; Song, Xiaoqing; Williams Shanks, Trina R. The Earned Income Tax Credit (EITC) is a refundable credit for low-income workers that is mainly targeted at families with children. This study uses the Survey of Income and Program Participation{\textquoteright}s (SIPP) topical modules on Assets \& Liabilities to examine the effects of EITC expansions during the early 1990s on the unsecured debt of the households of single mothers. We use two difference-in-differences comparisons over the study period 1988 to 1999, first comparing single mothers to single childless women, and then comparing single mothers with two or more children to single mothers with exactly one child. In both cases we find that the EITC expansions are associated with a relative decline in the unsecured debt of affected households of single mothers. This suggests that single mothers may have used part of their EITC to limit the growth of their unsecured debt during this period.}, url = {http://hdl.handle.net/1813/34516}, author = {Shaefer, H. Luke and Song, Xiaoqing and Williams Shanks, Trina R.} } @techreport {handle:1813:33184, title = {Estimating identification disclosure risk using mixed membership models}, number = {1813:33184}, year = {2011}, institution = {Duke University / National Institute of Statistical Sciences (NISS)}, type = {Preprint}, abstract = {Estimating identification disclosure risk using mixed membership models Manrique-Vallier, Daniel; Reiter, Jerome Statistical agencies and other organizations that disseminate data are obligated to protect data subjects{\textquoteright} confidentiality. For example, ill-intentioned individuals might link data subjects to records in other databases by matching on common characteristics (keys). Successful links are particularly problematic for data subjects with combinations of keys that are unique in the population. Hence, as part of their assessments of disclosure risks, many data stewards estimate the probabilities that sample uniques on sets of discrete keys are also population uniques on those keys. This is typically done using log-linear modeling on the keys. However, log-linear models can yield biased estimates of cell probabilities for sparse contingency tables with many zero counts, which often occurs in databases with many keys.
This bias can result in unreliable estimates of probabilities of uniqueness and, hence, misrepresentations of disclosure risks. We propose an alternative to log-linear models for datasets with sparse keys based on a Bayesian version of grade of membership (GoM) models. We present a Bayesian GoM model for multinomial variables and offer an MCMC algorithm for fitting the model. We evaluate the approach by treating data from a recent US Census Bureau public use microdata sample as a population, taking simple random samples from that population, and benchmarking estimated probabilities of uniqueness against population values. Compared to log-linear models, GoM models provide more accurate estimates of the total number of uniques in the samples. Additionally, they offer record-level predictions of uniqueness that dominate those based on log-linear models.}, url = {http://hdl.handle.net/1813/33184}, author = {Manrique-Vallier, Daniel and Reiter, Jerome} } @techreport {handle:1813:46201, title = {NCRN Meeting Fall 2011}, number = {1813:46201}, year = {2011}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2011 Vilhuber, Lars. Held at the Census Bureau Conference Center.}, url = {http://hdl.handle.net/1813/46201}, author = {Vilhuber, Lars} } @techreport {handle:1813:30923, title = {A Proposed Solution to the Archiving and Curation of Confidential Scientific Inputs}, number = {1813:30923}, year = {2011}, institution = {Cornell University}, type = {Preprint}, abstract = {A Proposed Solution to the Archiving and Curation of Confidential Scientific Inputs Abowd, John M.; Vilhuber, Lars; Block, William We develop the core of a method for solving the data archive and curation problem that confronts the custodians of restricted-access research data and the scientific users of such data. Our solution recognizes the dual protections afforded by physical security and access limitation protocols. It is based on extensible tools and can be easily incorporated into existing instructional materials.}, url = {http://hdl.handle.net/1813/30923}, author = {Abowd, John M. and Vilhuber, Lars and Block, William} } @article {Fienberg2011a, title = {Secure multiparty linear regression based on homomorphic encryption}, journal = {Journal of Official Statistics}, volume = {27}, year = {2011}, pages = {669}, author = {Robert Hall and Stephen E. Fienberg and Yuval Nardi} } @article {2497, title = {Parallel Associations and the Structure of Autobiographical Knowledge}, journal = {Journal of Applied Research in Memory and Cognition}, volume = {5}, year = {2016}, month = {06/2016}, pages = {150-157}, abstract = {The self-memory system (SMS) model of autobiographical knowledge conceives that memories are structured thematically, organized both hierarchically and temporally. This model has been challenged on several fronts, including the absence of parallel linkages across pathways. Calendar survey interviewing shows the frequent and varied use of parallel associations in autobiographical recall. Parallel associations in these data are commonplace, and are driven more by respondents{\textquoteright} generative retrieval than by interviewers{\textquoteright} probing. Parallel associations represent a number of autobiographical knowledge themes that are interrelated across life domains. The content of parallel associations is nearly evenly split between general and transitional events, supporting the importance of transitions in autobiographical memory.
Associations in respondents{\textquoteright} memories (both parallel and sequential) demonstrate complex interactions with interviewer verbal behaviors during generative retrieval. In addition to discussing the implications of these results to the SMS model, implications are also drawn for transition theory and the basic-systems model.}, keywords = {Autobiographical knowledge, Autobiographical memory, Autobiographical periods, Episodic memory, Retrospective reports}, isbn = {2211-3681}, url = {http://www.sciencedirect.com/science/article/pii/S2211368116300183}, author = {Belli, Robert F. and Al Baghal, Tarek} } @booklet {2518, title = {Are Self-Description Scales Better than Agree/Disagree Scales in Mail and Telephone Surveys?}, author = {Timbrook, Jerry and Smyth, Jolene D. and Olson, Kristen} } @booklet {2519, title = {Are Self-Description Scales Better than Agree/Disagree Scales in Mail and Telephone Surveys?}, author = {Timbrook, Jerry and Smyth, Jolene D. and Olson, Kristen} } @booklet {2522, title = {The ATUS and SIPP-EHC: Recent developments}, author = {Belli, R. F.} } @booklet {2529, title = {Audit trails, parallel navigation, and the SIPP}, author = {Lee, Jinyoung} } @article {2263, title = {Bayesian estimation of bipartite matchings for record linkage}, journal = {Journal of the American Statistical Association}, abstract = {The bipartite record linkage task consists of merging two disparate datafiles containing information on two overlapping sets of entities. This is non-trivial in the absence of unique identifiers and it is important for a wide variety of applications given that it needs to be solved whenever we have to combine information from different sources. Most statistical techniques currently used for record linkage are derived from a seminal paper by Fellegi and Sunter (1969). These techniques usually assume independence in the matching statuses of record pairs to derive estimation procedures and optimal point estimators. We argue that this independence assumption is unreasonable and instead target a bipartite matching between the two datafiles as our parameter of interest. Bayesian implementations allow us to quantify uncertainty on the matching decisions and derive a variety of point estimators using different loss functions. We propose partial Bayes estimates that allow uncertain parts of the bipartite matching to be left unresolved. We evaluate our approach to record linkage using a variety of challenging scenarios and show that it outperforms the traditional methodology. We illustrate the advantages of our methods merging two datafiles on casualties from the civil war of El Salvador.}, author = {Mauricio Sadinle} } @article {2662, title = {Biomass prediction using density dependent diameter distribution models}, journal = {Annals of Applied Statistics}, volume = {11}, pages = {340-361}, abstract = {Prediction of aboveground biomass, particularly at large spatial scales, is necessary for estimating global-scale carbon sequestration. Since biomass can be measured only by sacrificing trees, total biomass on plots is never observed. Rather, allometric equations are used to convert individual tree diameter to individual biomass, perhaps with noise. The values for all trees on a plot are then summed to obtain a derived total biomass for the plot. Then, with derived total biomasses for a collection of plots, regression models, using appropriate environmental covariates, are employed to attempt explanation and prediction.
Not surprisingly, when out-of-sample validation is examined, such a model will predict total biomass well for holdout data because it is obtained using exactly the same derived approach. Apart from the somewhat circular nature of the regression approach, it also fails to employ the actual observed plot level response data. At each plot, we observe a random number of trees, each with an associated diameter, producing a sample of diameters. A model based on this random number of tree diameters provides understanding of how environmental regressors explain abundance of individuals, which in turn explains individual diameters. We incorporate density dependence because the distribution of tree diameters over a plot of fixed size depends upon the number of trees on the plot. After fitting this model, we can obtain predictive distributions for individual-level biomass and plot-level total biomass. We show that predictive distributions for plot-level biomass obtained from a density-dependent model for diameters will be much different from predictive distributions using the regression approach. Moreover, they can be more informative for capturing uncertainty than those obtained from modeling derived plot-level biomass directly. We develop a density-dependent diameter distribution model and illustrate with data from the national Forest Inventory and Analysis (FIA) database. We also describe how to scale predictions to larger spatial regions. Our predictions agree (in magnitude) with available wisdom on mean and variation in biomass at the hectare scale.}, url = {https://projecteuclid.org/euclid.aoas/1491616884}, author = {Schliep, E.M. and A.E. Gelfand and J.S. Clark and B.J. Tomasek} } @inbook {2523, title = {Calendar and time diary methods: The tools to assess well-being in the 21st century}, booktitle = {Handbook of research methods in health and social sciences}, publisher = {Springer}, organization = {Springer}, author = {C{\'o}rdova Cazar, Ana Luc{\'\i}a and Belli, Robert F.}, editor = {Liamputtong, P} } @booklet {2526, title = {Does relation of retrieval pathways to data quality differ by self or proxy response status?}, author = {Lee, Jinyoung and Belli, Robert F.} } @booklet {2512, title = {"During the LAST YEAR, Did You...": The Effect of Emphasis in CATI Survey Questions on Data Quality}, author = {Olson, Kristen and Smyth, Jolene D.} } @booklet {2511, title = {"During the LAST YEAR, Did You...": The Effect of Emphasis in CATI Survey Questions on Data Quality}, author = {Olson, Kristen and Smyth, Jolene D.} } @booklet {2510, title = {The Effect of Question Characteristics, Respondents and Interviewers on Question Reading Time and Question Reading Behaviors in CATI Surveys}, author = {Olson, Kristen and Smyth, Jolene and Kirchner, Antje} } @booklet {2508, title = {The Effect of Question Characteristics, Respondents and Interviewers on Question Reading Time and Question Reading Behaviors in CATI Surveys}, author = {Olson, Kristen} } @booklet {2503, title = {The Effects of Respondent and Question Characteristics on Respondent Behaviors}, author = {Ganshert, Amanda and Olson, Kristen and Smyth, Jolene} } @article {doi:10.1080/00031305.2016.1277158, title = {An Empirical Comparison of Multiple Imputation Methods for Categorical Data}, journal = {The American Statistician}, number = {ja}, pages = {0-0}, abstract = {Multiple imputation is a common approach for dealing with missing values in statistical databases.
The imputer fills in missing values with draws from predictive models estimated from the observed data, resulting in multiple, completed versions of the database. Researchers have developed a variety of default routines to implement multiple imputation; however, there has been limited research comparing the performance of these methods, particularly for categorical data. We use simulation studies to compare repeated sampling properties of three default multiple imputation methods for categorical data, including chained equations using generalized linear models, chained equations using classification and regression trees, and a fully Bayesian joint distribution based on Dirichlet Process mixture models. We base the simulations on categorical data from the American Community Survey. In the circumstances of this study, the results suggest that default chained equations approaches based on generalized linear models are dominated by the default regression tree and Bayesian mixture model approaches. They also suggest competing advantages for the regression tree and Bayesian mixture model approaches, making both reasonable default engines for multiple imputation of categorical data. Supplementary material for this article is available online.}, doi = {10.1080/00031305.2016.1277158}, url = {http://dx.doi.org/10.1080/00031305.2016.1277158}, author = {Olanrewaju Akande and Fan Li and Jerome Reiter} } @article {2661, title = {An ensemble quadratic echo state network for nonlinear spatio-temporal forecasting}, journal = {Stat}, abstract = {Spatio-temporal data and processes are prevalent across a wide variety of scientific disciplines. These processes are often characterized by nonlinear time dynamics that include interactions across multiple scales of spatial and temporal variability. The data sets associated with many of these processes are increasing in size due to advances in automated data measurement, management, and numerical simulator output. Nonlinear spatio-temporal models have only recently seen interest in statistics, but there are many classes of such models in the engineering and geophysical sciences. Traditionally, these models are more heuristic than those that have been presented in the statistics literature, but are often intuitive and quite efficient computationally. We show here that with fairly simple, but important, enhancements, the echo state network (ESN) machine learning approach can be used to generate long-lead forecasts of nonlinear spatio-temporal processes, with reasonable uncertainty quantification, and at only a fraction of the computational expense of traditional parametric nonlinear spatio-temporal models.}, url = {https://arxiv.org/abs/1708.05094}, author = {McDermott, P.L.
and Wikle, C.K.} } @booklet {2525, title = {Evaluating Data quality in Time Diary Surveys Using Paradata}, author = {C{\'o}rdova Cazar, Ana Luc{\'\i}a and Belli, Robert F.} } @booklet {2524, title = {An evaluation study of the use of paradata to enhance data quality in the American Time Use Survey (ATUS)}, author = {C{\'o}rdova Cazar, Ana Luc{\'\i}a and Belli, Robert F.} } @booklet {2528, title = {Event History Calendar Interviewing Dynamics and Data Quality in the Survey of Income and Program Participation}, author = {Lee, Jinyoung} } @booklet {2506, title = {Going off Script: How Interviewer Behavior Affects Respondent Behaviors in Telephone Surveys}, author = {Kirchner, Antje and Olson, Kristen and Smyth, Jolene} } @booklet {2514, title = {How do Low Versus High Response Scale Ranges Impact the Administration and Answering of Behavioral Frequency Questions in Telephone Surveys?}, author = {Sarwar, Mazen and Olson, Kristen and Smyth, Jolene} } @booklet {2516, title = {How do Mismatches Affect Interviewer/Respondent Interactions in the Question/Answer Process?}, author = {Smyth, Jolene D. and Olson, Kristen} } @booklet {2501, title = {Interviewer Influence on Interviewer-Respondent Interaction During Battery Questions}, author = {Cochran, Beth and Olson, Kristen and Smyth, Jolene} } @booklet {2504, title = {Memory Gaps in the American Time Use Survey. Are Respondents Forgetful or is There More to it?}, author = {Kirchner, Antje and Belli, Robert F. and Deal, Caitlin E. and C{\'o}rdova-Cazar, Ana Lucia} } @booklet {2527, title = {Relation of questionnaire navigation patterns and data quality: Keystroke data analysis}, author = {Lee, Jinyoung} } @booklet {2521, title = {Respondent retrieval strategies inform the structure of autobiographical knowledge}, author = {Belli, R. F.} } @booklet {2513, title = {Response Scales: Effects on Data Quality for Interviewer Administered Surveys}, author = {Sarwar, Mazen and Olson, Kristen and Smyth, Jolene} } @booklet {2530, title = {Using audit trails to evaluate an event history calendar survey instrument}, author = {Lee, Jinyoung and Seloske, Ben and Belli, Robert F.} } @booklet {2520, title = {Using behavior coding to understand respondent retrieval strategies that inform the structure of autobiographical knowledge}, author = {Belli, R. F.} } @booklet {2517, title = {Why do Mobile Interviews Take Longer? A Behavior Coding Perspective}, author = {Timbrook, Jerry and Smyth, Jolene and Olson, Kristen} } @booklet {2531, title = {Working with the SIPP-EHC audit trails: Parallel and sequential retrieval}, author = {Lee, Jinyoung and Seloske, Ben and C{\'o}rdova Cazar, Ana Luc{\'\i}a and Eck, Adam and Belli, Robert F.} }