@article {2559, title = {Imputation in U.S. Manufacturing Data and Its Implications for Productivity Dispersion}, journal = {Review of Economics and Statistics}, year = {Submitted}, abstract = {In the U.S. Census Bureau{\textquoteright}s 2002 and 2007 Censuses of Manufactures, 79\% and 73\% of observations, respectively, have imputed data for at least one variable used to compute total factor productivity. The Bureau primarily imputes for missing values using mean-imputation methods, which can reduce the true underlying variance of the imputed variables. For every variable entering TFP in 2002 and 2007, we show that the dispersion is significantly smaller in the Census mean-imputed data than in the Census non-imputed data. As an alternative to mean imputation, we show how to use classification and regression trees (CART) to allow for a distribution of multiple possible imputed values based on other plants that the CART algorithm determines to be similar with respect to other observed variables. For 90\% of the 473 industries in 2002 and 84\% of the 471 industries in 2007, we find that TFP dispersion increases as we move from Census mean-imputed data to Census non-imputed data to the CART-imputed data.}, doi = {10.1162/REST_a_00678}, url = {http://www.mitpressjournals.org/doi/abs/10.1162/REST_a_00678}, author = {T. Kirk White and Jerome P. Reiter and Amil Petrin} } @article {2634, title = {The Earned Income Tax Credit and Food Insecurity: Who Benefits?}, year = {forthcoming}, author = {Shaefer, H.L. and Wilson, R.} } @article {2663, title = {Adaptively-Tuned Particle Swarm Optimization with Application to Spatial Design}, journal = {Stat}, volume = {6}, year = {2017}, pages = {145{\textendash}159}, abstract = {Particle swarm optimization (PSO) algorithms are a class of heuristic optimization algorithms that are attractive for complex optimization problems. We propose using PSO to solve spatial design problems, e.g., choosing new locations to add to an existing monitoring network. Additionally, we introduce two new classes of PSO algorithms that perform well in a wide variety of circumstances, called adaptively tuned PSO and adaptively tuned bare bones PSO. To illustrate these algorithms, we apply them to a common spatial design problem: choosing new locations to add to an existing monitoring network. Specifically, we consider a network in the Houston, TX, area for monitoring ambient ozone levels, which have been linked to out-of-hospital cardiac arrest rates. Published 2017. This article has been contributed to by US Government employees and their work is in the public domain in the USA.}, doi = {10.1002/sta4.142}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.142/abstract}, author = {Simpson, M. and Wikle, C.K. and Holan, S.H.} } @article {2664, title = {Bayesian Hierarchical Multi-Population Multistate Jolly-Seber Models with Covariates: Application to the Pallid Sturgeon Population Assessment Program}, journal = {Journal of the American Statistical Association}, volume = {112}, year = {2017}, pages = {471-483}, abstract = {Estimating abundance for multiple populations is of fundamental importance to many ecological monitoring programs. Equally important is quantifying the spatial distribution and characterizing the migratory behavior of target populations within the study domain. To achieve these goals, we propose a Bayesian hierarchical multi-population multistate Jolly{\textendash}Seber model that incorporates covariates.
The model is proposed using a state-space framework and has several distinct advantages. First, multiple populations within the same study area can be modeled simultaneously. As a consequence, it is possible to achieve improved parameter estimation by {\textquotedblleft}borrowing strength{\textquotedblright} across different populations. In many cases, such as our motivating example involving endangered species, this borrowing of strength is crucial, as there is relatively little information for one of the populations under consideration. Second, in addition to accommodating covariate information, we develop a computationally efficient Markov chain Monte Carlo algorithm that requires no tuning. Importantly, the model we propose allows us to draw inference on each population as well as on multiple populations simultaneously. Finally, we demonstrate the effectiveness of our method through a motivating example of estimating the spatial distribution and migration of hatchery and wild populations of the endangered pallid sturgeon (Scaphirhynchus albus), using data from the Pallid Sturgeon Population Assessment Program on the Lower Missouri River. Supplementary materials for this article are available online.}, doi = {10.1080/01621459.2016.1211531}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2016.1211531}, author = {Wu, G. and Holan, S.H.} } @article {2658, title = {The Cepstral Model for Multivariate Time Series: The Vector Exponential Model}, journal = {Statistica Sinica}, volume = {27}, year = {2017}, pages = {23-42}, abstract = {Vector autoregressive (VAR) models have become a staple in the analysis of multivariate time series and are formulated in the time domain as difference equations, with an implied covariance structure. In many contexts, it is desirable to work with a stable, or at least stationary, representation. To fit such models, one must impose restrictions on the coefficient matrices to ensure that certain determinants are nonzero, which, except in special cases, may prove burdensome. To circumvent these difficulties, we propose a flexible frequency domain model expressed in terms of the spectral density matrix. Specifically, this paper treats the modeling of covariance stationary vector-valued (i.e., multivariate) time series via an extension of the exponential model for the spectrum of a scalar time series. We discuss the modeling advantages of the vector exponential model and its computational facets, such as how to obtain Wold coefficients from given cepstral coefficients. Finally, we demonstrate the utility of our approach through simulation as well as two illustrative data examples focusing on multi-step ahead forecasting and estimation of squared coherence.}, keywords = {Autocovariance matrix, Bayesian estimation, Cepstral, Coherence, Spectral density matrix, stochastic search variable selection, Wold coefficients}, doi = {10.5705/ss.202014.0024}, url = {http://www3.stat.sinica.edu.tw/statistica/J27N1/J27N12/J27N12.html}, author = {Holan, S.H. and McElroy, T.S. and Wu, G.} } @techreport {2655, title = {Computationally Efficient Multivariate Spatio-Temporal Models for High-Dimensional Count-Valued Data (With Discussion)}, number = {1512.07273}, year = {2017}, abstract = {We introduce a Bayesian approach for multivariate spatio-temporal prediction for high-dimensional count-valued data. Our primary interest is when there are possibly millions of data points referenced over different variables, geographic regions, and times.
This problem requires extensive methodological advancements, as jointly modeling correlated data of this size leads to the so-called "big n problem." The computational complexity of prediction in this setting is further exacerbated by acknowledging that count-valued data are naturally non-Gaussian. Thus, we develop a new computationally efficient distribution theory for this setting. In particular, we introduce a multivariate log-gamma distribution and provide substantial theoretical development including: results regarding conditional distributions, marginal distributions, an asymptotic relationship with the multivariate normal distribution, and full-conditional distributions for a Gibbs sampler. To incorporate dependence between variables, regions, and time points, a multivariate spatio-temporal mixed effects model (MSTM) is used. The results in this manuscript are extremely general, and can be used for data that exhibit fewer sources of dependency than what we consider (e.g., multivariate, spatial-only, or spatio-temporal-only data). Hence, the implications of our modeling framework may have a large impact on the general problem of jointly modeling correlated count-valued data. We show the effectiveness of our approach through a simulation study. Additionally, we demonstrate our proposed methodology with an important application analyzing data obtained from the Longitudinal Employer-Household Dynamics (LEHD) program, which is administered by the U.S. Census Bureau.}, keywords = {Aggregation, American Community Survey, Bayesian hierarchical model, Big Data, Longitudinal Employer-Household Dynamics (LEHD) program, Markov chain Monte Carlo, Non-Gaussian, Quarterly Workforce Indicators}, url = {https://arxiv.org/abs/1512.07273}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K.} } @article {Hu2017-nm, title = {Dirichlet Process Mixture Models for Modeling and Generating Synthetic Versions of Nested Categorical Data}, journal = {Bayesian Analysis}, year = {2017}, month = {24 January 2017}, abstract = {We present a Bayesian model for estimating the joint distribution of multivariate categorical data when units are nested within groups. Such data arise frequently in social science settings, for example, people living in households. The model assumes that (i) each group is a member of a group-level latent class, and (ii) each unit is a member of a unit-level latent class nested within its group-level latent class. This structure allows the model to capture dependence among units in the same group. It also facilitates simultaneous modeling of variables at both group and unit levels. We develop a version of the model that assigns zero probability to groups and units with physically impossible combinations of variables. We apply the model to estimate multivariate relationships in a subset of the American Community Survey. Using the estimated model, we generate synthetic household data that could be disseminated as redacted public use files. Supplementary materials (Hu et al., 2017) for this article are available online.}, doi = {10.1214/16-BA1047}, url = {http://projecteuclid.org/euclid.ba/1485227030}, author = {Hu, Jingchen and Reiter, Jerome P. and Wang, Quanli} } @techreport {handle:1813:52650, title = {Effects of a Government-Academic Partnership: Has the NSF-Census Bureau Research Network Helped Secure the Future of the Federal Statistical System?}, number = {1813:52650}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {

The National Science Foundation-Census Bureau Research Network (NCRN) was established in 2011 to create interdisciplinary research nodes on methodological questions of interest and significance to the broader research community and to the Federal Statistical System (FSS), particularly the Census Bureau. The activities to date have covered both fundamental and applied statistical research and have focused at least in part on the training of current and future generations of researchers in skills of relevance to surveys and alternative measurement of economic units, households, and persons. This paper discusses some of the key research findings of the eight nodes, organized into six topics: (1) Improving census and survey data collection methods; (2) Using alternative sources of data; (3) Protecting privacy and confidentiality by improving disclosure avoidance; (4) Using spatial and spatio-temporal statistical modeling to improve estimates; (5) Assessing data cost and quality tradeoffs; and (6) Combining information from multiple sources. It also reports on collaborations across nodes and with federal agencies, new software developed, and educational activities and outcomes. The paper concludes with an evaluation of the ability of the FSS to apply the NCRN{\textquoteright}s research outcomes and suggests some next steps, as well as the implications of this research-network model for future federal government renewal initiatives. This paper began as a May 8, 2015 presentation to the National Academies of Science{\textquoteright}s Committee on National Statistics by two of the principal investigators of the NCRN {\textendash} John Abowd and the late Steve Fienberg (Carnegie Mellon University). The authors acknowledge the contributions of the other principal investigators of the NCRN who are not co-authors of the paper (William Block, William Eddy, Alan Karr, Charles Manski, Nicholas Nagle, and Rebecca Nugent), the co-principal investigators, and the comments of Patrick Cantwell, Constance Citro, Adam Eck, Brian Harris-Kojetin, and Eloise Parker. We note with sorrow the deaths of Stephen Fienberg and Allan McCutcheon, two of the original NCRN principal investigators. The principal investigators also wish to acknowledge Cheryl Eavey{\textquoteright}s sterling grant administration on behalf of the NSF. The conclusions reached in this paper are not the responsibility of the National Science Foundation (NSF), the Census Bureau, or any of the institutions to which the authors belong.

}, url = {http://hdl.handle.net/1813/52650}, author = {Weinberg, Daniel and Abowd, John M. and Belli, Robert F. and Cressie, Noel and Folch, David C. and Holan, Scott H. and Levenstein, Margaret C. and Olson, Kristen M. and Reiter, Jerome P. and Shapiro, Matthew D. and Smyth, Jolene and Soh, Leen-Kiat and Spencer, Bruce and Spielman, Seth E. and Vilhuber, Lars and Wikle, Christopher} } @techreport {handle:1813:52164, title = {Formal Privacy Models and Title 13}, number = {1813:52164}, year = {2017}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {A new collaboration between academia and the Census Bureau to further the Bureau{\textquoteright}s use of formal privacy models.}, url = {http://hdl.handle.net/1813/52164}, author = {Nissim, Kobbi and Gasser, Urs and Smith, Adam and Vadhan, Salil and O{\textquoteright}Brien, David and Wood, Alexandra} } @article {2657, title = {Regionalization of Multiscale Spatial Processes using a Criterion for Spatial Aggregation Error}, journal = {Journal of the Royal Statistical Society - Series B}, year = {2017}, abstract = {The modifiable areal unit problem and the ecological fallacy are known problems that occur when modeling multiscale spatial processes. We investigate how these forms of spatial aggregation error can guide a regionalization over a spatial domain of interest. By "regionalization" we mean a specification of geographies that define the spatial support for areal data. This topic has been studied vigorously by geographers, but has been given less attention by spatial statisticians. Thus, we propose a criterion for spatial aggregation error (CAGE), which we minimize to obtain an optimal regionalization. To define CAGE we draw a connection between spatial aggregation error and a new multiscale representation of the Karhunen-Loeve (K-L) expansion. This relationship between CAGE and the multiscale K-L expansion leads to illuminating theoretical developments including: connections between spatial aggregation error, squared prediction error, spatial variance, and a novel extension of Obled-Creutin eigenfunctions. The effectiveness of our approach is demonstrated through an analysis of two datasets, one using the American Community Survey and one related to environmental ocean winds.}, keywords = {American Community Survey, empirical orthogonal functions, MAUP, Reduced rank, Spatial basis functions, Survey data}, url = {https://arxiv.org/abs/1502.01974}, author = {Bradley, J.R. and Wikle, C.K.
and Holan, S.H.} } @article {2660, title = {Visualizing uncertainty in areal data estimates with bivariate choropleth maps, map pixelation, and glyph rotation}, journal = {Stat}, volume = {6}, year = {2017}, pages = {292{\textendash}302}, abstract = {In statistics, we quantify uncertainty to help determine the accuracy of estimates, yet this crucial piece of information is rarely included on maps visualizing areal data estimates. We develop and present three approaches to include uncertainty on maps: (1) the bivariate choropleth map repurposed to visualize uncertainty; (2) the pixelation of counties to include values within an estimate{\textquoteright}s margin of error; and (3) the rotation of a glyph, located at a county{\textquoteright}s centroid, to represent an estimate{\textquoteright}s uncertainty. The second method is presented as both a static map and visuanimation. We use American Community Survey estimates and their corresponding margins of error to demonstrate the methods and highlight the importance of visualizing uncertainty in areal data. An extensive online supplement provides the R code necessary to produce the maps presented in this article as well as alternative versions of them.}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.150/abstract}, author = {Lucchesi, L.R. and Wikle, C.K.} } @article {2665, title = {Bayesian Hierarchical Models with Conjugate Full-Conditional Distributions for Dependent Data from the Natural Exponential Family}, journal = {Journal of the American Statistical Association - Theory \& Methods}, year = {2016}, abstract = {We introduce a Bayesian approach for analyzing (possibly) high-dimensional dependent data that are distributed according to a member from the natural exponential family of distributions. This problem requires extensive methodological advancements, as jointly modeling high-dimensional dependent data leads to the so-called "big n problem." The computational complexity of the "big n problem" is further exacerbated when allowing for non-Gaussian data models, as is the case here. Thus, we develop new computationally efficient distribution theory for this setting. In particular, we introduce something we call the "conjugate multivariate distribution," which is motivated by the univariate distribution introduced in Diaconis and Ylvisaker (1979). Furthermore, we provide substantial theoretical and methodological development including: results regarding conditional distributions, an asymptotic relationship with the multivariate normal distribution, conjugate prior distributions, and full-conditional distributions for a Gibbs sampler. The results in this manuscript are extremely general, and can be adapted to many different settings. We demonstrate the proposed methodology through simulated examples and analyses based on estimates obtained from the US Census Bureau{\textquoteright}s American Community Survey (ACS).}, url = {https://arxiv.org/abs/1701.07506}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K.} } @article {2668, title = {Bayesian Lattice Filters for Time-Varying Autoregression and Time-Frequency Analysis}, journal = {Bayesian Analysis}, year = {2016}, pages = {977-1003}, abstract = {Modeling nonstationary processes is of paramount importance to many scientific disciplines including environmental science, ecology, and finance, among others. Consequently, flexible methodology that provides accurate estimation across a wide range of processes is a subject of ongoing interest.
We propose a novel approach to model-based time-frequency estimation using time-varying autoregressive models. In this context, we take a fully Bayesian approach and allow both the autoregressive coefficients and innovation variance to vary over time. Importantly, our estimation method uses the lattice filter and is cast within the partial autocorrelation domain. The marginal posterior distributions are of standard form and, as a convenient by-product of our estimation method, our approach avoids undesirable matrix inversions. As such, estimation is extremely computationally efficient and stable. To illustrate the effectiveness of our approach, we conduct a comprehensive simulation study that compares our method with other competing methods and find that, in most cases, our approach performs better in terms of average squared error between the estimated and true time-varying spectral density. Lastly, we demonstrate our methodology through three modeling applications; namely, insect communication signals, environmental data (wind components), and macroeconomic data (US gross domestic product (GDP) and consumption).}, url = {https://arxiv.org/abs/1408.2757}, author = {Yang, W.H. and Holan, S.H. and Wikle, C.K.} } @article {2666, title = {Bayesian Spatial Change of Support for Count-Valued Survey Data with Application to the American Community Survey}, journal = {Journal of the American Statistical Association}, year = {2016}, pages = {472-487}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year "period-estimates," and corresponding margins of error, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies, it is often of interest to data users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on "new" spatial supports in "real-time." This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in "real-time." We demonstrate the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, url = {https://arxiv.org/abs/1405.7227}, author = {Bradley, J.R. and Wikle, C.K.
and Holan, S.H.} } @article {2667, title = {Generating Partially Synthetic Geocoded Public Use Data with Decreased Disclosure Risk Using Differential Smoothing}, journal = {Journal of the Royal Statistical Society - Series A}, year = {2016}, abstract = {When collecting geocoded confidential data with the intent to disseminate, agencies often resort to altering the geographies prior to making data publicly available due to data privacy obligations. An alternative to releasing aggregated and/or perturbed data is to release multiply-imputed synthetic data, where sensitive values are replaced with draws from statistical models designed to capture important distributional features in the collected data. One issue that has received relatively little attention, however, is how to handle spatially outlying observations in the collected data, as common spatial models often have a tendency to overfit these observations. The goal of this work is to bring this issue to the forefront and propose a solution, which we refer to as "differential smoothing." After implementing our method on simulated data, highlighting the effectiveness of our approach under various scenarios, we illustrate the framework using data consisting of sale prices of homes in San Francisco.}, url = {https://arxiv.org/abs/1507.05529}, author = {Quick, H. and Holan, S.H. and Wikle, C.K.} } @article {2669, title = {Multivariate Spatio-Temporal Survey Fusion with Application to the American Community Survey and Local Area Unemployment Statistics}, journal = {Stat}, year = {2016}, pages = {224-233}, abstract = {There are often multiple surveys available that estimate and report related demographic variables of interest that are referenced over space and/or time. Not all surveys produce the same information, and thus, combining these surveys typically leads to higher quality estimates. That is, not all surveys have the same level of precision, nor do they always provide estimates of the same variables. In addition, various surveys often produce estimates with incomplete spatio-temporal coverage. By combining surveys using a Bayesian approach, we can account for different margins of error and leverage dependencies to produce estimates of every variable considered at every spatial location and every time point. Specifically, our strategy is to use a hierarchical modelling approach, where the first stage of the model incorporates the margin of error associated with each survey. Then, in a lower stage of the hierarchical model, the multivariate spatio-temporal mixed effects model is used to incorporate multivariate spatio-temporal dependencies of the processes of interest. We adopt a fully Bayesian approach for combining surveys; that is, given all of the available surveys, the conditional distributions of the latent processes of interest are used for statistical inference. To demonstrate our proposed methodology, we jointly analyze period estimates from the US Census Bureau{\textquoteright}s American Community Survey, and estimates obtained from the Bureau of Labor Statistics Local Area Unemployment Statistics program. Copyright {\textcopyright} 2016 John Wiley \& Sons, Ltd.}, url = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.120/full}, author = {Bradley, J.R. and Holan, S.H.
and Wikle, C.K.} } @techreport {handle:1813:45821, title = {NCRN Meeting Fall 2016: Scanner Data and Economic Statistics: A Unified Approach}, number = {1813:45821}, year = {2016}, institution = {University of Michigan}, type = {Preprint}, url = {http://hdl.handle.net/1813/45821}, author = {Redding, Stephen J. and Weinstein, David E.} } @techreport {handle:1813:43895, title = {NCRN Meeting Spring 2016: Developing job linkages for the Health and Retirement Study}, number = {1813:43895}, year = {2016}, institution = {University of Michigan}, type = {Preprint}, abstract = {This paper documents work using probabilistic record linkage to create a crosswalk between jobs reported in the Health and Retirement Study (HRS) and the list of workplaces on the Census Bureau{\textquoteright}s Business Register. Matching job records provides an opportunity to join variables that occur uniquely in separate datasets, to validate responses, and to develop missing data imputation models. Identifying the respondent{\textquoteright}s workplace ({\textquotedblleft}establishment{\textquotedblright}) is valuable for HRS because it allows researchers to incorporate the effects of particular social, economic, and geospatial work environments in studies of respondent health and retirement behavior. The linkage makes use of name and address standardizing techniques tailored to business data that were recently developed in a collaboration between researchers at Census, Cornell, and the University of Michigan. The matching protocol makes no use of the identity of the HRS respondent and strictly protects the confidentiality of information about the respondent{\textquoteright}s employer. The paper first describes the clerical review process used to create a set of human-reviewed candidate pairs, and use of that set to train matching models. It then describes and compares several linking strategies that make use of employer name, address, and phone number. Finally, it discusses alternative ways of incorporating information on match uncertainty into estimates based on the linked data, and illustrates their use with a preliminary sample of matched HRS jobs. Presented at the NCRN Meeting Spring 2016 in Washington DC on May 9-10, 2016; see http://www.ncrn.info/event/ncrn-spring-2016-meeting}, url = {http://hdl.handle.net/1813/43895}, author = {McCue, Kristin and Abowd, John and Levenstein, Margaret and Patki, Dhiren and Rodgers, Ann and Shapiro, Matthew and Wasi, Nada} } @article {2243, title = {Releasing synthetic magnitude microdata constrained to fixed marginal totals}, journal = {Statistical Journal of the International Association for Official Statistics}, volume = {32}, year = {2016}, month = {02/2016}, pages = {93-108}, chapter = {93}, abstract = {We present approaches to generating synthetic microdata for multivariate data that take on non-negative integer values, such as magnitude data in economic surveys. The basic idea is to estimate a mixture of Poisson distributions to describe the multivariate distribution, and release draws from the posterior predictive distribution of the model.
We develop approaches that guarantee the synthetic data sum to marginal totals computed from the original data, as well as approaches that do not enforce this equality. For both cases, we present methods for assessing disclosure risks inherent in releasing synthetic magnitude microdata. We illustrate the methodology using economic data from a survey of manufacturing establishments.}, keywords = {Confidential, Disclosure, establishment, mixture, Poisson, risk}, doi = {10.3233/SJI-160959}, url = {http://content.iospress.com/download/statistical-journal-of-the-iaos/sji959}, author = {Wei, Lan and Reiter, Jerome P.} } @article {1739, title = {Bayesian Analysis of Spatially-Dependent Functional Responses with Spatially-Dependent Multi-Dimensional Functional Predictors}, journal = {Statistica Sinica}, volume = {25}, year = {2015}, pages = {205-223}, doi = {10.5705/ss.2013.245w}, url = {http://www3.stat.sinica.edu.tw/preprint/SS-13-245w_Preprint.pdf}, author = {Yang, W. H. and Wikle, C.K. and Holan, S.H. and Sudduth, K. and Meyers, D.B.} } @article {1741, title = {Bayesian Binomial Mixture Models for Estimating Abundance in Ecological Monitoring Studies}, journal = {Annals of Applied Statistics}, volume = {9}, year = {2015}, pages = {1-26}, doi = {10.1214/14-AOAS801}, url = {http://projecteuclid.org/euclid.aoas/1430226082}, author = {Wu, G. and Holan, S.H. and Nilon, C.H. and Wikle, C.K.} } @article {2015arXiv:1408.2757, title = {Bayesian Lattice Filters for Time-Varying Autoregression and Time-Frequency Analysis}, journal = {ArXiv}, year = {2015}, abstract = {Modeling nonstationary processes is of paramount importance to many scientific disciplines including environmental science, ecology, and finance, among others. Consequently, flexible methodology that provides accurate estimation across a wide range of processes is a subject of ongoing interest. We propose a novel approach to model-based time-frequency estimation using time-varying autoregressive models. In this context, we take a fully Bayesian approach and allow both the autoregressive coefficients and innovation variance to vary over time. Importantly, our estimation method uses the lattice filter and is cast within the partial autocorrelation domain. The marginal posterior distributions are of standard form and, as a convenient by-product of our estimation method, our approach avoids undesirable matrix inversions. As such, estimation is extremely computationally efficient and stable. To illustrate the effectiveness of our approach, we conduct a comprehensive simulation study that compares our method with other competing methods and find that, in most cases, our approach performs better in terms of average squared error between the estimated and true time-varying spectral density. Lastly, we demonstrate our methodology through three modeling applications; namely, insect communication signals, environmental data (wind components), and macroeconomic data (US gross domestic product (GDP) and consumption).}, url = {http://arxiv.org/abs/1408.2757}, author = {Yang, W.~H. and Holan, S.~H. and Wikle, C.K.} } @article {2221, title = {Bayesian Lattice Filters for Time-Varying Autoregression and Time{\textendash}Frequency Analysis}, journal = {Bayesian Analysis}, year = {2015}, month = {10/2015}, pages = {27}, abstract = {Modeling nonstationary processes is of paramount importance to many scientific disciplines including environmental science, ecology, and finance, among others.
Consequently, flexible methodology that provides accurate estimation across a wide range of processes is a subject of ongoing interest. We propose a novel approach to model-based time{\textendash}frequency estimation using time-varying autoregressive models. In this context, we take a fully Bayesian approach and allow both the autoregressive coefficients and innovation variance to vary over time. Importantly, our estimation method uses the lattice filter and is cast within the partial autocorrelation domain. The marginal posterior distributions are of standard form and, as a convenient by-product of our estimation method, our approach avoids undesirable matrix inversions. As such, estimation is extremely computationally efficient and stable. To illustrate the effectiveness of our approach, we conduct a comprehensive simulation study that compares our method with other competing methods and find that, in most cases, our approach performs better in terms of average squared error between the estimated and true time-varying spectral density. Lastly, we demonstrate our methodology through three modeling applications; namely, insect communication signals, environmental data (wind components), and macroeconomic data (US gross domestic product (GDP) and consumption).}, keywords = {locally stationary, model selection, nonstationary partial autocorrelation, piecewise stationary, sequential estimation, time-varying spectral density}, doi = {10.1214/15-BA978}, url = {http://projecteuclid.org/euclid.ba/1445263834}, author = {Yang, W.~H. and Holan, Scott H. and Wikle, Christopher K.} } @article {2039, title = {Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography}, journal = {Spatial Statistics}, volume = {14}, year = {2015}, month = {08/2015}, pages = {439--451}, doi = {10.1016/j.spasta.2015.07.008}, url = {http://www.sciencedirect.com/science/article/pii/S2211675315000718}, author = {Quick, Harrison and Holan, Scott H. and Wikle, Christopher K. and Reiter, Jerome P.} } @article {2015arXiv:1407.7795, title = {Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography}, journal = {ArXiv}, year = {2015}, abstract = {Many data stewards collect confidential data that include fine geography. When sharing these data with others, data stewards strive to disseminate data that are informative for a wide range of spatial and non-spatial analyses while simultaneously protecting the confidentiality of data subjects{\textquoteright} identities and attributes. Typically, data stewards meet this challenge by coarsening the resolution of the released geography and, as needed, perturbing the confidential attributes. When done with high intensity, these redaction strategies can result in released data with poor analytic quality. We propose an alternative dissemination approach based on fully synthetic data. We generate data using marked point process models that can maintain both the statistical properties and the spatial dependence structure of the confidential data. We illustrate the approach using data consisting of mortality records from Durham, North Carolina.}, url = {http://arxiv.org/abs/1407.7795}, author = {Quick, H. and Holan, S.~H. and Wikle, C.~K.
and Reiter, J.~P.} } @article {2088, title = {Bayesian Semiparametric Hierarchical Empirical Likelihood Spatial Models}, journal = {Journal of Statistical Planning and Inference}, volume = {165}, year = {2015}, month = {10/2015}, pages = {78-90}, issn = {0378-3758}, doi = {10.1016/j.jspi.2015.04.002}, author = {Porter, A.T. and Holan, S.H. and Wikle, C.K.} } @article {2219, title = {Bayesian Spatial Change of Support for Count-Valued Survey Data with Application to the American Community Survey}, journal = {Journal of the American Statistical Association}, year = {2015}, month = {12/2015}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year {\textquotedblleft}period-estimates,{\textquotedblright} and corresponding margins of error, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies, it is often of interest to data users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on {\textquotedblleft}new{\textquotedblright} spatial supports in {\textquotedblleft}real-time.{\textquotedblright} This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in {\textquotedblleft}real-time.{\textquotedblright} We show the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, keywords = {Aggregation, American Community Survey, Bayesian hierarchical model, Givens angle prior, Markov chain Monte Carlo, Multiscale model, Non-Gaussian}, doi = {10.1080/01621459.2015.1117471}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2015.1117471}, author = {Bradley, Jonathan R. and Wikle, Christopher K. and Holan, Scott H.} } @article {2015arXiv:1405.7227, title = {Bayesian Spatial Change of Support for Count{\textendash}Valued Survey Data}, journal = {ArXiv}, year = {2015}, abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year "period-estimates," and corresponding margins of error, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies, it is often of interest to data users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on "new" spatial supports in "real-time." This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in "real-time." We demonstrate the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.}, url = {http://arxiv.org/abs/1405.7227}, author = {Bradley, J.~R. and Wikle, C.K.
and Holan, S.~H.} } @article {1825, title = {Change in Visible Impervious Surface Area in Southeastern Michigan Before and After the {\textquotedblleft}Great Recession:{\textquotedblright} Spatial Differentiation in Remotely Sensed Land-Cover Dynamics}, journal = {Population and Environment}, volume = {36}, year = {2015}, month = {03/2015}, pages = {331-355}, chapter = {331}, doi = {10.1007/s11111-014-0219-y}, url = {http://link.springer.com/article/10.1007\%2Fs11111-014-0219-y}, author = {Wilson, C. R. and Brown, D. G.} } @article {1883, title = {Comment on {\textquotedblleft}Semiparametric Bayesian Density Estimation with Disparate Data Sources: A Meta-Analysis of Global Childhood Undernutrition{\textquotedblright} by Finucane, M. M., Paciorek, C. J., Stevens, G. A., and Ezzati, M.}, journal = {Journal of the American Statistical Association}, year = {2015}, author = {Wikle, C.K. and Holan, S.H.} } @conference {2120, title = {Determining Potential for Breakoff in Time Diary Survey Using Paradata}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {05/2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wettlaufer, D. and Arunachalam, H. and Atkin, G. and Eck, A. and Soh, L.-K. and Belli, R.F.} } @article {2040, title = {Dirichlet Process Mixture Models for Nested Categorical Data}, journal = {ArXiv}, year = {2015}, abstract = {We present a Bayesian model for estimating the joint distribution of multivariate categorical data when units are nested within groups. Such data arise frequently in social science settings, for example, people living in households. The model assumes that (i) each group is a member of a group-level latent class, and (ii) each unit is a member of a unit-level latent class nested within its group-level latent class. This structure allows the model to capture dependence among units in the same group. It also facilitates simultaneous modeling of variables at both group and unit levels. We develop a version of the model that assigns zero probability to groups and units with physically impossible combinations of variables. We apply the model to estimate multivariate relationships in a subset of the American Community Survey. Using the estimated model, we generate synthetic household data that could be disseminated as redacted public use files with high analytic validity and low disclosure risks. Supplementary materials for this article are available online.}, url = {http://arxiv.org/pdf/1412.2282v3.pdf}, author = {Hu, J. and Reiter, J.P. and Wang, Q.} } @article {1824, title = {Expanding the Discourse on Antipoverty Policy: Reconsidering a Negative Income Tax}, journal = {Journal of Poverty}, volume = {19}, year = {2015}, month = {02/2015}, pages = {218-238}, abstract = {This article proposes that advocates for the poor consider the replacement of the current means-tested safety net in the United States with a Negative Income Tax (NIT), a guaranteed income program that lifts families{\textquoteright} incomes above a minimum threshold. The article highlights gaps in service provision that leave millions in poverty, explains how a NIT could help fill those gaps, and compares current expenditures on major means-tested programs to estimated expenditures necessary for a NIT.
Finally, it addresses the financial and political concerns that are likely to arise in the event that a NIT proposal gains traction among policy makers.}, keywords = {economic well-being, poverty alleviation, public policy, social welfare policy}, doi = {10.1080/10875549.2014.991889}, url = {http://dx.doi.org/10.1080/10875549.2014.991889}, author = {Jessica Wiederspan and Elizabeth Rhodes and H. Luke Shaefer} } @conference {2119, title = {Grids and Online Panels: A Comparison of Device Type from a Survey Quality Perspective}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wang, Mengyang and McCutcheon, Allan L. and Allen, Laura} } @inbook {2092, title = {Hierarchical models for uncertainty quantification: An overview}, booktitle = {Handbook of Uncertainty Quantification}, year = {2015}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-319-12384-4}, author = {Wikle, C.K.}, editor = {Ghanem, R. and Higdon, D. and Owhadi, H.} } @inbook {WikleHooten2015, title = {Hierarchical Agent-Based Spatio-Temporal Dynamic Models for Discrete Valued Data}, booktitle = {Handbook of Discrete-Valued Time Series}, year = {2015}, publisher = {Chapman and Hall/CRC Press}, organization = {Chapman and Hall/CRC Press}, address = {Boca Raton, FL}, isbn = {9781466577732}, url = {http://www.crcpress.com/product/isbn/9781466577732}, author = {Wikle, C.K. and Hooten, M.B.}, editor = {Davis, R. and Holan, S. and Lund, R. and Ravishanker, N.} } @inbook {HolanWikle, title = {Hierarchical Dynamic Generalized Linear Mixed Models for Discrete-Valued Spatio-Temporal Data}, booktitle = {Handbook of Discrete-Valued Time Series}, year = {2015}, publisher = {Chapman and Hall/CRC Press}, organization = {Chapman and Hall/CRC Press}, address = {Boca Raton, FL}, isbn = {9781466577732}, url = {http://www.crcpress.com/product/isbn/9781466577732}, author = {Holan, S.H. and Wikle, C.K.}, editor = {Davis, R. and Holan, S. and Lund, R. and Ravishanker, N.} } @inbook {2093, title = {Hierarchical Spatial Models}, booktitle = {Encyclopedia of Geographical Information Science}, year = {2015}, publisher = {Springer}, organization = {Springer}, author = {Arab, A. and Hooten, M.B. and Wikle, C.K.} } @article {2090, title = {Hierarchical, stochastic modeling across spatiotemporal scales of large river ecosystems and somatic growth in fish populations under various climate models: Missouri River sturgeon example}, journal = {Geological Society}, year = {2015}, author = {Wildhaber, M.L. and Wikle, C.K. and Moran, E.H. and Anderson, C.J. and Franz, K.J.
and Dey, R.} } @conference {2103, title = {I Know What You Did Next: Predicting Respondent{\textquoteright}s Next Activity Using Machine Learning}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {May 14-17, 2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Arunachalam, H. and Atkin, G. and Eck, A. and Wettlaufer, D. and Soh, L.-K. and Belli, R.F.} } @article {Wikletoappear, title = {Modern Perspectives on Statistics for Spatio-Temporal Data}, journal = {WIREs Computational Statistics}, volume = {7}, year = {2015}, pages = {86-98}, issn = {1939-0068}, doi = {10.1002/wics.1341}, url = {http://dx.doi.org/10.1002/wics.1341}, author = {Wikle, C.K.} } @article {1882, title = {Multiscale Analysis of Survey Data: Recent Developments and Exciting Prospects}, journal = {Statistics Views}, year = {2015}, author = {Bradley, J.R. and Wikle, C.K. and Holan, S.H.} } @article {2089, title = {Multivariate Spatial Hierarchical Bayesian Empirical Likelihood Methods for Small Area Estimation}, journal = {Stat}, volume = {4}, year = {2015}, month = {05/2015}, pages = {108-116}, issn = {2049-1573}, doi = {10.1002/sta4.81}, url = {http://dx.doi.org/10.1002/sta4.81}, author = {Porter, A.T. and Holan, S.H. and Wikle, C.K.} } @article {2015arXiv:1503.00982, title = {Multivariate Spatio-Temporal Models for High-Dimensional Areal Data with Application to Longitudinal Employer-Household Dynamics}, journal = {ArXiv}, year = {2015}, abstract = {Many data sources report related variables of interest that are also referenced over geographic regions and time; however, there are relatively few general statistical methods that one can readily use that incorporate these multivariate spatio-temporal dependencies. Additionally, many multivariate spatio-temporal areal datasets are extremely high-dimensional, which leads to practical issues when formulating statistical models. For example, we analyze Quarterly Workforce Indicators (QWI) published by the US Census Bureau{\textquoteright}s Longitudinal Employer-Household Dynamics (LEHD) program. QWIs are available by different variables, regions, and time points, resulting in millions of tabulations. Despite their already expansive coverage, by adopting a fully Bayesian framework, the scope of the QWIs can be extended to provide estimates of missing values along with associated measures of uncertainty. Motivated by the LEHD, and other applications in federal statistics, we introduce the multivariate spatio-temporal mixed effects model (MSTM), which can be used to efficiently model high-dimensional multivariate spatio-temporal areal datasets. The proposed MSTM extends the notion of Moran{\textquoteright}s I basis functions to the multivariate spatio-temporal setting. This extension leads to several methodological contributions including extremely effective dimension reduction, a dynamic linear model for multivariate spatio-temporal areal processes, and the reduction of a high-dimensional parameter space using a novel parameter model.}, url = {http://arxiv.org/abs/1503.00982}, author = {Bradley, J.~R. and Holan, S.~H.
and Wikle, C.K.} } @article {2169, title = {Multivariate Spatio-Temporal Models for High-Dimensional Areal Data with Application to Longitudinal Employer-Household Dynamics}, journal = {Annals of Applied Statistics}, volume = {9}, year = {2015}, month = {03/2015}, abstract = {Many data sources report related variables of interest that are also referenced over geographic regions and time; however, there are relatively few general statistical methods that one can readily use that incorporate these multivariate spatio-temporal dependencies. Additionally, many multivariate spatio-temporal areal datasets are extremely high-dimensional, which leads to practical issues when formulating statistical models. For example, we analyze Quarterly Workforce Indicators (QWI) published by the US Census Bureau{\textquoteright}s Longitudinal Employer-Household Dynamics (LEHD) program. QWIs are available by different variables, regions, and time points, resulting in millions of tabulations. Despite their already expansive coverage, by adopting a fully Bayesian framework, the scope of the QWIs can be extended to provide estimates of missing values along with associated measures of uncertainty. Motivated by the LEHD, and other applications in federal statistics, we introduce the multivariate spatio-temporal mixed effects model (MSTM), which can be used to efficiently model high-dimensional multivariate spatio-temporal areal datasets. The proposed MSTM extends the notion of Moran{\textquoteright}s I basis functions to the multivariate spatio-temporal setting. This extension leads to several methodological contributions including extremely effective dimension reduction, a dynamic linear model for multivariate spatio-temporal areal processes, and the reduction of a high-dimensional parameter space using a novel parameter model.}, doi = {10.1214/15-AOAS862}, author = {Bradley, J.R. and Holan, S.H. and Wikle, C.K.} } @techreport {handle:1813:40176, title = {NCRN Meeting Spring 2015: Models for Multiscale Spatially-Referenced Count Data}, number = {1813:40176}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Presentation at the NCRN Meeting Spring 2015.}, url = {http://hdl.handle.net/1813/40176}, author = {Holan, Scott and Bradley, Jonathan R. and Wikle, Christopher K.} } @techreport {handle:1813:40177, title = {NCRN Meeting Spring 2015: Regionalization of Multiscale Spatial Processes Using a Criterion for Spatial Aggregation Error}, number = {1813:40177}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Develop and implement a statistical criterion to diagnose spatial aggregation error that can facilitate the choice of regionalizations of spatial data. Presentation at the NCRN Meeting Spring 2015.}, url = {http://hdl.handle.net/1813/40177}, author = {Wikle, Christopher K.
and Bradley, Jonathan and Holan, Scott} } @techreport {handle:1813:40179, title = {NCRN Meeting Spring 2015: Training Undergraduates, Graduate Students, Postdocs, and Federal Agencies: Methodology, Data, and Science for Federal Statistics}, number = {1813:40179}, year = {2015}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {Presentation at the NCRN Spring 2015 Meeting.}, url = {http://hdl.handle.net/1813/40179}, author = {Cressie, Noel and Holan, Scott H. and Wikle, Christopher K.} } @article {1787, title = {Record Linkage using Stata: Pre-processing, Linking and Reviewing Utilities}, journal = {The Stata Journal}, volume = {15}, year = {2015}, pages = {1-15}, abstract = {In this article, we describe Stata utilities that facilitate probabilistic record linkage{\textemdash}the technique typically used for merging two datasets with no common record identifier. While the preprocessing tools are developed specifically for linking two company databases, the other tools can be used for many different types of linkage. Specifically, the stnd_compname and stnd_address commands parse and standardize company names and addresses to improve the match quality when linking. The reclink2 command is a generalized version of Blasnik{\textquoteright}s reclink (2010, Statistical Software Components S456876, Department of Economics, Boston College) that allows for many-to-one matching. Finally, clrevmatch is an interactive tool that allows the user to review matched results in an efficient and seamless manner. Rather than exporting results to another file format (for example, Excel), inputting clerical reviews, and importing back into Stata, one can use the clrevmatch tool to conduct all of these steps within Stata. This helps improve the speed and flexibility of matching, which often involves multiple runs.}, url = {http://www.stata-journal.com/article.html?article=dm0082}, author = {Wasi, Nada and Flaaen, Aaron} } @article {2015arXiv:1502.01974, title = {Regionalization of Multiscale Spatial Processes using a Criterion for Spatial Aggregation Error}, journal = {ArXiv}, year = {2015}, abstract = {The modifiable areal unit problem and the ecological fallacy are known problems that occur when modeling multiscale spatial processes. We investigate how these forms of spatial aggregation error can guide a regionalization over a spatial domain of interest. By "regionalization" we mean a specification of geographies that define the spatial support for areal data. This topic has been studied vigorously by geographers, but has been given less attention by spatial statisticians. Thus, we propose a criterion for spatial aggregation error (CAGE), which we minimize to obtain an optimal regionalization. To define CAGE we draw a connection between spatial aggregation error and a new multiscale representation of the Karhunen-Loeve (K-L) expansion. This relationship between CAGE and the multiscale K-L expansion leads to illuminating theoretical developments including: connections between spatial aggregation error, squared prediction error, spatial variance, and a novel extension of Obled-Creutin eigenfunctions.
The effectiveness of our approach is demonstrated through an analysis of two datasets, one using the American Community Survey and one related to environmental ocean winds.}, url = {http://arxiv.org/abs/1502.01974}, author = {Bradley, J.~R. and Wikle, C.K. and Holan, S.~H.} } @article {1931, title = {Simultaneous Edit-Imputation for Continuous Microdata}, journal = {Journal of the American Statistical Association}, volume = {110}, year = {2015}, pages = {987-999}, doi = {10.1080/01621459.2015.1040881}, url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.2015.1040881}, author = {Kim, H. J. and Cox, L. H. and Karr, A. F. and Reiter, J. P. and Wang, Q.} } @article {1742, title = {Small Area Estimation via Multivariate Fay-Herriot Models With Latent Spatial Dependence}, journal = {Australian \& New Zealand Journal of Statistics}, volume = {57}, year = {2015}, pages = {15-29}, url = {http://arxiv.org/abs/1310.7211}, author = {Porter, A.T. and Wikle, C.K. and Holan, S.H.} } @article {STA4:STA494, title = {Spatio-temporal change of support with application to American Community Survey multi-year period estimates}, journal = {Stat}, volume = {4}, year = {2015}, month = {10/2015}, pages = {255{\textendash}270}, abstract = {We present hierarchical Bayesian methodology to perform spatio-temporal change of support (COS) for survey data with Gaussian sampling errors. This methodology is motivated by the American Community Survey (ACS), which is an ongoing survey administered by the US Census Bureau that provides timely information on several key demographic variables. The ACS has published 1-year, 3-year, and 5-year period estimates, and margins of error, for demographic and socio-economic variables recorded over predefined geographies. The spatio-temporal COS methodology considered here provides data users with a way to estimate ACS variables on customized geographies and time periods while accounting for sampling errors. Additionally, 3-year ACS period estimates are to be discontinued, and this methodology can provide predictions of ACS variables for 3-year periods given the available period estimates. The methodology is based on a spatio-temporal mixed-effects model with a low-dimensional spatio-temporal basis function representation, which provides multi-resolution estimates through basis function aggregation in space and time. This methodology includes a novel parameterization that uses a target dynamical process and recently proposed parsimonious Moran{\textquoteright}s I propagator structures. Our approach is demonstrated through two applications using public-use ACS estimates and is shown to produce good predictions on a hold-out set of 3-year period estimates. Copyright {\textcopyright} 2015 John Wiley \& Sons, Ltd.}, keywords = {Bayesian, change-of-support, dynamical, hierarchical models, mixed-effects model, Moran{\textquoteright}s I, multi-year period estimate}, issn = {2049-1573}, doi = {10.1002/sta4.94}, url = {http://dx.doi.org/10.1002/sta4.94}, author = {Bradley, Jonathan R. and Wikle, Christopher K. and Holan, Scott H.} } @article {2091, title = {A stochastic bioenergetics model-based approach to translating large river flow and temperature into fish population responses: the pallid sturgeon example}, journal = {Geological Society}, volume = {408}, year = {2015}, issn = {2041-4927}, doi = {10.1144/SP408.10}, author = {Wildhaber, M.L. and Dey, R. and Wikle, C.K. and Anderson, C.J. and Moran, E.H.
and Franz, K.J.} } @conference {2104, title = {Using Machine Learning Techniques to Predict Respondent Type from A Priori Demographic Information}, booktitle = {70th Annual Conference of the American Association for Public Opinion Research (AAPOR)}, year = {2015}, month = {May 14-17, 2015}, address = {Hollywood, Florida}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Atkin, G. and Arunachalam, H. and Eck, A. and Wettlaufer, D. and Soh, L.-K. and Belli, R.F.} } @article {Wikle2014, title = {Agent Based Models: Statistical Challenges and Opportunities}, journal = {Statistics Views}, year = {2014}, publisher = {Wiley}, url = {http://www.statisticsviews.com/details/feature/6354691/Agent-Based-Models-Statistical-Challenges-and-Opportunities.html}, author = {Wikle, C.K.} } @article {1527, title = {Bayesian estimation of disclosure risks for multiply imputed, synthetic data}, journal = {Journal of Privacy and Confidentiality}, volume = {6}, year = {2014}, month = {2014}, abstract = {

Agencies seeking to disseminate public use microdata, i.e., data on individual records, can replace confidential values with multiple draws from statistical models estimated with the collected data. We present a framework for evaluating disclosure risks inherent in releasing multiply-imputed, synthetic data. The basic idea is to mimic an intruder who computes posterior distributions of confidential values given the released synthetic data and prior knowledge. We illustrate the methodology with artificial fully synthetic data and with partial synthesis of the Survey of Youth in Custody.

}, url = {http://repository.cmu.edu/jpc/vol6/iss1/2}, author = {Reiter, J. P. and Wang, Q. and Zhang, B.} } @techreport {handle:1813:44702, title = {CED2AR: The Comprehensive Extensible Data Documentation and Access Repository}, number = {1813:44702}, year = {2014}, institution = {Cornell University}, type = {Preprint}, abstract = {CED2AR: The Comprehensive Extensible Data Documentation and Access Repository Lagoze, Carl; Vilhuber, Lars; Williams, Jeremy; Perry, Benjamin; Block, William C. We describe the design, implementation, and deployment of the Comprehensive Extensible Data Documentation and Access Repository (CED2AR). This is a metadata repository system that allows researchers to search, browse, access, and cite confidential data and metadata through either a web-based user interface or programmatically through a search API, all the while re-using and linking to existing archive and provider-generated metadata. CED2AR is distinguished from other metadata repository-based applications due to requirements that derive from its social science context. These include the need to cloak confidential data and metadata and manage complex provenance chains. Presented at 2014 IEEE/ACM Joint Conference on Digital Libraries (JCDL), Sept 8-12, 2014}, url = {http://hdl.handle.net/1813/44702}, author = {Lagoze, Carl and Vilhuber, Lars and Williams, Jeremy and Perry, Benjamin and Block, William C.} } @techreport {HolanMcElroyWu2014, title = {The Cepstral Model for Multivariate Time Series: The Vector Exponential Model.}, number = {1406.0801}, year = {2014}, institution = {arXiv}, type = {Preprint}, abstract = {

Vector autoregressive (VAR) models have become a staple in the analysis of multivariate time series and are formulated in the time domain as difference equations, with an implied covariance structure. In many contexts, it is desirable to work with a stable, or at least stationary, representation. To fit such models, one must impose restrictions on the coefficient matrices to ensure that certain determinants are nonzero; which, except in special cases, may prove burdensome. To circumvent these difficulties, we propose a flexible frequency domain model expressed in terms of the spectral density matrix. Specifically, this paper treats the modeling of covariance stationary vector-valued (i.e., multivariate) time series via an extension of the exponential model for the spectrum of a scalar time series. We discuss the modeling advantages of the vector exponential model and its computational facets, such as how to obtain Wold coefficients from given cepstral coefficients. Finally, we demonstrate the utility of our approach through simulation as well as two illustrative data examples focusing on multi-step ahead forecasting and estimation of squared coherence.

}, url = {http://arxiv.org/abs/1406.0801}, author = {Holan, S.H. and McElroy, T.S. and Wu, G.} } @conference {2168, title = {Data Quality among Devices to Complete Surveys: Comparing Personal Computers, Smartphones and Tablets}, booktitle = {Midwest Association for Public Opinion Research Annual Meeting}, year = {2014}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Wang, Mengyang and McCutcheon, Allan L.} } @inbook {1576, title = {Disclosure risk evaluation for fully synthetic data}, booktitle = {Privacy in Statistical Databases}, volume = {8744}, year = {2014}, pages = {185-199}, publisher = {Springer}, organization = {Springer}, address = {Heidelberg}, author = {J. Hu and J.P. Reiter and Q. Wang} } @article {1518, title = {Multiple imputation of missing or faulty values under linear constraints}, journal = {Journal of Business and Economic Statistics}, volume = {32}, year = {2014}, pages = {375-386}, abstract = {

Many statistical agencies, survey organizations, and research centers collect data that suffer from item nonresponse and erroneous or inconsistent values. These data may be required to satisfy linear constraints, for example, bounds on individual variables and inequalities for ratios or sums of variables. Often these constraints are designed to identify faulty values, which then are blanked and imputed. The data also may exhibit complex distributional features, including nonlinear relationships and highly nonnormal distributions. We present a fully Bayesian, joint model for modeling or imputing data with missing/blanked values under linear constraints that (i) automatically incorporates the constraints in inferences and imputations, and (ii) uses a flexible Dirichlet process mixture of multivariate normal distributions to reflect complex distributional features. Our strategy for estimation is to augment the observed data with draws from a hypothetical population in which the constraints are not present, thereby taking advantage of computationally expedient methods for fitting mixture models. Missing/blanked items are sampled from their posterior distribution using the Hit-and-Run sampler, which guarantees that all imputations satisfy the constraints. We illustrate the approach using manufacturing data from Colombia, examining the potential to preserve joint distributions and a regression from the plant productivity literature. Supplementary materials for this article are available online.

}, doi = {10.1080/07350015.2014.885435}, author = {Kim, H. J. and Reiter, J. P. and Wang, Q. and Cox, L. H. and Karr, A. F.} } @techreport {handle:1813:37750, title = {NCRN Meeting Fall 2014: Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography}, number = {1813:37750}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Bayesian Marked Point Process Modeling for Generating Fully Synthetic Public Use Data with Point-Referenced Geography Quick, Harrison; Holan, Scott; Wikle, Christopher; Reiter, Jerry Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37750}, author = {Quick, Harrison and Holan, Scott and Wikle, Christopher and Reiter, Jerry} } @techreport {handle:1813:37446, title = {NCRN Meeting Fall 2014: Change in Visible Impervious Surface Area in Southeastern Michigan Before and After the {\textquotedblleft}Great Recession{\textquotedblright}}, number = {1813:37446}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Change in Visible Impervious Surface Area in Southeastern Michigan Before and After the {\textquotedblleft}Great Recession{\textquotedblright} Wilson, Courtney; Brown, Daniel G. Presentation at Fall 2014 NCRN meeting}, url = {http://hdl.handle.net/1813/37446}, author = {Wilson, Courtney and Brown, Daniel G.} } @techreport {handle:1813:37749, title = {NCRN Meeting Fall 2014: Mixed Effects Modeling for Multivariate-Spatio-Temporal Areal Data}, number = {1813:37749}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Fall 2014: Mixed Effects Modeling for Multivariate-Spatio-Temporal Areal Data Bradley, Jonathan; Holan, Scott; Wikle, Christopher Presentation from NCRN Fall 2014 meeting}, url = {http://hdl.handle.net/1813/37749}, author = {Bradley, Jonathan and Holan, Scott and Wikle, Christopher} } @techreport {handle:1813:36392, title = {NCRN Meeting Spring 2014: Integrating PROV with DDI: Mechanisms of Data Discovery within the U.S. Census Bureau}, number = {1813:36392}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Integrating PROV with DDI: Mechanisms of Data Discovery within the U.S.
Census Bureau Block, William; Brown, Warren; Williams, Jeremy; Vilhuber, Lars; Lagoze, Carl Presentation at NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36392}, author = {Block, William and Brown, Warren and Williams, Jeremy and Vilhuber, Lars and Lagoze, Carl} } @techreport {handle:1813:36396, title = {NCRN Meeting Spring 2014: Summer Working Group for Employer List Linking (SWELL)}, number = {1813:36396}, year = {2014}, institution = {NCRN Coordinating Office}, type = {Preprint}, abstract = {NCRN Meeting Spring 2014: Summer Working Group for Employer List Linking (SWELL) Gathright, Graton; Kutzbach, Mark; McCue, Kristin; McEntarfer, Erika; Monti, Holly; Trageser, Kelly; Vilhuber, Lars; Wasi, Nada; Wignall, Christopher Presentation for NCRN Spring 2014 meeting}, url = {http://hdl.handle.net/1813/36396}, author = {Gathright, Graton and Kutzbach, Mark and McCue, Kristin and McEntarfer, Erika and Monti, Holly and Trageser, Kelly and Vilhuber, Lars and Wasi, Nada and Wignall, Christopher} } @article {Porter2014a, title = {Spatial Fay-Herriot Models for Small Area Estimation with Functional Covariates}, journal = {Spatial Statistics}, volume = {10}, year = {2014}, pages = {27-42}, url = {http://arxiv.org/pdf/1303.6668v3.pdf}, author = {Porter, A.T. and Holan, S.H. and Wikle, C.K. and Cressie, N.} } @conference {woo:pih:acq:2014, title = {Would a Privacy Fundamentalist Sell their DNA for \$1000... if Nothing Bad Happened Thereafter? A Study of the Westin Categories, Behavioral Intentions, and Consequences}, booktitle = {Proceedings of the Tenth Symposium on Usable Privacy and Security (SOUPS)}, year = {2014}, note = {IAPP SOUPS Privacy Award Winner}, publisher = {ACM}, organization = {ACM}, address = {New York, NY}, url = {https://www.usenix.org/conference/soups2014/proceedings/presentation/woodruff}, author = {Woodruff, A. and Pihur, V. and Acquisti, A. and Consolvo, S. and Schmidt, L. and Brandimarte, L.} } @conference {Wu2013b, title = {Bayesian Modeling in the Era of Big Data: The Role of High-Throughput and High-Performance Computing}, booktitle = {The Extreme Science and Engineering Discovery Environment Conference}, year = {2013}, month = {July}, address = {San Diego, CA}, author = {Wu, G.} } @conference {Wu2013a, title = {Binomial Mixture Models for Urban Ecological Monitoring Studies Using American Community Survey Demographic Covariates}, booktitle = {Joint Statistical Meetings 2013}, year = {2013}, month = {August}, address = {Montreal, Canada}, author = {Wu, G.} } @article {DBLP:journals/ijdc/LagozeBWAV13, title = {Data Management of Confidential Data}, journal = {International Journal of Digital Curation}, volume = {8}, number = {1}, year = {2013}, note = {Presented at 8th International Digital Curation Conference 2013, Amsterdam. See also http://hdl.handle.net/1813/30924}, pages = {265-278}, abstract = {Social science researchers increasingly make use of data that is confidential because it contains linkages to the identities of people, corporations, etc. The value of this data lies in the ability to join the identifiable entities with external data such as genome data, geospatial information, and the like. However, the confidentiality of this data is a barrier to its utility and curation, making it difficult to fulfill US federal data management mandates and interfering with basic scholarly practices such as validation and reuse of existing results. We describe the complexity of the relationships among data that span a public and private divide.
We then describe our work on the CED2AR prototype, a first step in providing researchers with a tool that spans this divide and makes it possible for them to search, access, and cite that data.}, doi = {10.2218/ijdc.v8i1.259}, author = {Carl Lagoze and William C. Block and Jeremy Williams and John M. Abowd and Lars Vilhuber} } @conference {2167, title = {Do {\textquoteleft}Don{\textquoteright}t Know{\textquoteright} Responses = Survey Satisficing? Evidence from the Gallup Panel Paradata}, booktitle = {American Association for Public Opinion Research 2013 Annual Conference}, year = {2013}, address = {Boston, MA}, url = {http://www.aapor.org/AAPORKentico/Conference/Recent-Conferences.aspx}, author = {Wang, Mengyang and Ruppanner, Leah and McCutcheon, Allan L.} } @conference {Wikle2013b, title = {Ecological Prediction with Nonlinear Multivariate Time-Frequency Functional Data Models}, booktitle = {Joint Statistical Meetings 2013}, year = {2013}, month = {August}, address = {Montreal, Canada}, author = {Wikle, C.K.} } @article {Holan2014c, title = {Ecological Prediction With Nonlinear Multivariate Time-Frequency Functional Data Models}, journal = {Journal of Agricultural, Biological, and Environmental Statistics}, volume = {18}, year = {2013}, pages = {450-474}, doi = {10.1007/s13253-013-0142-1}, url = {http://link.springer.com/article/10.1007/s13253-013-0142-1}, author = {Yang, W.H. and Wikle, C.K. and Holan, S.H. and Wildhaber, M.L.} } @conference {LagozeEtAl2013b, title = {Encoding Provenance Metadata for Social Science Datasets}, booktitle = {Metadata and Semantics Research}, series = {Communications in Computer and Information Science}, volume = {390}, year = {2013}, pages = {123-134}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, keywords = {DDI, eSocial Science, Metadata, Provenance}, isbn = {978-3-319-03436-2}, doi = {10.1007/978-3-319-03437-9_13}, url = {http://dx.doi.org/10.1007/978-3-319-03437-9_13}, author = {Lagoze, Carl and Williams, Jeremy and Vilhuber, Lars}, editor = {Garoufallou, Emmanouel and Greenberg, Jane} } @techreport {handle:1813:34443, title = {Encoding Provenance of Social Science Data: Integrating PROV with DDI}, number = {1813:34443}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Encoding Provenance of Social Science Data: Integrating PROV with DDI Lagoze, Carl; Block, William C; Williams, Jeremy; Abowd, John; Vilhuber, Lars Provenance is a key component of evaluating the integrity and reusability of data for scholarship. While recording and providing access provenance has always been important, it is even more critical in the web environment in which data from distributed sources and of varying integrity can be combined and derived. The PROV model, developed under the auspices of the W3C, is a foundation for semantically-rich, interoperable, and web-compatible provenance metadata. We report on the results of our experimentation with integrating the PROV model into the DDI metadata for a complex, but characteristic, example social science data. We also present some preliminary thinking on how to visualize those graphs in the user interface.
Submitted to EDDI13, the 5th Annual European DDI User Conference, December 2013, Paris, France}, url = {http://hdl.handle.net/1813/34443}, author = {Lagoze, Carl and Block, William C and Williams, Jeremy and Abowd, John and Vilhuber, Lars} } @conference {LagozeEtAl2013, title = {Encoding Provenance of Social Science Data: Integrating PROV with DDI}, booktitle = {5th Annual European DDI User Conference}, year = {2013}, abstract = {Provenance is a key component of evaluating the integrity and reusability of data for scholarship. While recording and providing access provenance has always been important, it is even more critical in the web environment in which data from distributed sources and of varying integrity can be combined and derived. The PROV model, developed under the auspices of the W3C, is a foundation for semantically-rich, interoperable, and web-compatible provenance metadata. We report on the results of our experimentation with integrating the PROV model into the DDI metadata for a complex, but characteristic, example social science data. We also present some preliminary thinking on how to visualize those graphs in the user interface.}, keywords = {DDI, eSocial Science, Metadata, Provenance}, author = {Carl Lagoze and William C. Block and Jeremy Williams and Lars Vilhuber} } @article {wan:leo:chen:2013, title = {From Facebook Regrets to Facebook Privacy Nudges}, journal = {Ohio State Law Journal}, year = {2013}, note = {Invited paper}, author = {Wang, Y. and Leon, P. G. and Chen, X. and Komanduri, S. and Norcie, G. and Scott, K. and Acquisti, A. and Cranor, L. F. and Sadeh, N.} } @article {Wikle2013d, title = {Hierarchical Bayesian Spatio-Temporal Conway-Maxwell Poisson Models with Dynamic Dispersion}, journal = {Journal of Agricultural, Biological, and Environmental Statistics}, volume = {18}, year = {2013}, pages = {335-356}, doi = {10.1007/s13253-013-0141-2}, url = {http://link.springer.com/article/10.1007/s13253-013-0141-2}, author = {Wu, G. and Holan, S.H. and Wikle, C.K.} } @article {Wikle2013, title = {Hierarchical Spatio-Temporal Models and Survey Research}, journal = {Statistics Views}, year = {2013}, month = {May}, url = {http://www.statisticsviews.com/details/feature/4730991/Hierarchical-Spatio-Temporal-Models-and-Survey-Research.html}, author = {Wikle, C. and Holan, S. and Cressie, N.} } @booklet {Cressie2013, title = {How can survey estimates of small areas be improved by leveraging social-media data?}, journal = {The Survey Statistician}, number = {68}, year = {2013}, month = {July}, url = {http://isi.cbs.nl/iass/N68.pdf}, author = {Cressie, N. and Holan, S. and Wikle, C.} } @techreport {handle:1813:33362, title = {Improving User Access to Metadata for Public and Restricted Use US Federal Statistical Files}, number = {1813:33362}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Improving User Access to Metadata for Public and Restricted Use US Federal Statistical Files Block, William C.; Williams, Jeremy; Vilhuber, Lars; Lagoze, Carl; Brown, Warren; Abowd, John M. Presentation at NADDI 2013. This record has also been archived at http://kuscholarworks.ku.edu/dspace/handle/1808/11093 .}, url = {http://hdl.handle.net/1813/33362}, author = {Block, William C.
and Williams, Jeremy and Vilhuber, Lars and Lagoze, Carl and Brown, Warren and Abowd, John M.} } @techreport {handle:1813:34534, title = {Managing Confidentiality and Provenance across Mixed Private and Publicly-Accessed Data and Metadata}, number = {1813:34534}, year = {2013}, institution = {Cornell University}, type = {Preprint}, abstract = {Managing Confidentiality and Provenance across Mixed Private and Publicly-Accessed Data and Metadata Vilhuber, Lars; Abowd, John; Block, William; Lagoze, Carl; Williams, Jeremy Social science researchers are increasingly interested in making use of confidential micro-data that contains linkages to the identities of people, corporations, etc. The value of this linking lies in the potential to join these identifiable entities with external data such as genome data, geospatial information, and the like. Leveraging these linkages is an essential aspect of {\textquotedblleft}big data{\textquotedblright} scholarship. However, the utility of these confidential data for scholarship is compromised by the complex nature of their management and curation. This makes it difficult to fulfill US federal data management mandates and interferes with basic scholarly practices such as validation and reuse of existing results. We describe in this paper our work on the CED2AR prototype, a first step in providing researchers with a tool that spans the confidential/publicly-accessible divide, making it possible for researchers to identify, search, access, and cite those data. The particular points of interest in our work are the cloaking of metadata fields and the expression of provenance chains. For the former, we make use of existing fields in the DDI (Data Documentation Initiative) specification and suggest some minor changes to the specification. For the latter problem, we investigate the integration of DDI with recent work by the W3C PROV working group that has developed a generalizable and extensible model for expressing data provenance.}, url = {http://hdl.handle.net/1813/34534}, author = {Vilhuber, Lars and Abowd, John and Block, William and Lagoze, Carl and Williams, Jeremy} } @conference {Wikle2013e, title = {Nonlinear Dynamic Spatio-Temporal Statistical Models}, booktitle = {Southern Regional Council on Statistics Summer Research Conference}, year = {2013}, month = {June}, author = {Wikle, C.K.} } @inbook {Holan2013, title = {Semiparametric Dynamic Design of Monitoring Networks for Non-Gaussian Spatio-Temporal Data}, booktitle = {Spatio-temporal Design: Advances in Efficient Data Acquisition}, year = {2013}, pages = {269-284}, publisher = {Wiley}, organization = {Wiley}, keywords = {semiparametric dynamic design for non-Gaussian spatio-temporal data}, isbn = {9780470974292}, doi = {10.1002/9781118441862}, author = {Holan, S. and Wikle, C.}, editor = {Jorge Mateu and Werner Muller} } @booklet {Wikle2013c, title = {Statistics and the Environment: Overview and Challenges}, year = {2013}, note = {Invited Introductory Overview Lecture}, month = {May}, author = {Wikle, C.K.} } @phdthesis {Wilson2013, title = {Using Satellite Imagery to Evaluate and Analyze Socioeconomic Changes Observed with Census Data}, year = {2013}, note = {NCRN}, type = {Ph.D.}, author = {Wilson, C.
R.} } @article {Holan2012, title = {An Approach for Identifying and Predicting Economic Recessions in Real-Time Using Time-Frequency Functional Models}, journal = {Applied Stochastic Models in Business and Industry}, volume = {28}, year = {2012}, note = {DOI: 10.1002/asmb.1954}, month = {12/2012}, pages = {485-499}, keywords = {Bayesian model averaging, business cycles, empirical orthogonal functions, functional data, MIDAS, spectrogram, stochastic search variable selection}, doi = {10.1002/asmb.1954}, url = {http://onlinelibrary.wiley.com/doi/10.1002/asmb.1954/full}, author = {Holan, S. and Yang, W. and Matteson, D. and Wikle, C.K.} } @article {Wang2012, title = {Bayesian Multi-Regime Smooth Transition Regression with Ordered Categorical Variables}, journal = {Computational Statistics and Data Analysis}, volume = {56}, year = {2012}, note = {http://dx.doi.org/10.1016/j.csda.2012.04.018}, month = {December}, pages = {4165-4179}, doi = {10.1016/j.csda.2012.04.018}, url = {http://dx.doi.org/10.1016/j.csda.2012.04.018}, author = {Wang, J. and Holan, S.} } @conference {Wikle2012c, title = {Change of Support in Spatio-Temporal Dynamical Models}, booktitle = {Joint Statistical Meetings}, year = {2012}, month = {August}, address = {Montreal, Canada}, author = {Wikle, C.K.} } @techreport {handle:1813:30924, title = {Data Management of Confidential Data}, number = {1813:30924}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {Data Management of Confidential Data Lagoze, Carl; Block, William C.; Williams, Jeremy; Abowd, John M.; Vilhuber, Lars Social science researchers increasingly make use of data that is confidential because it contains linkages to the identities of people, corporations, etc. The value of this data lies in the ability to join the identifiable entities with external data such as genome data, geospatial information, and the like. However, the confidentiality of this data is a barrier to its utility and curation, making it difficult to fulfill US federal data management mandates and interfering with basic scholarly practices such as validation and reuse of existing results. We describe the complexity of the relationships among data that span a public and private divide. We then describe our work on the CED2AR prototype, a first step in providing researchers with a tool that spans this divide and makes it possible for them to search, access, and cite that data.}, url = {http://hdl.handle.net/1813/30924}, author = {Lagoze, Carl and Block, William C. and Williams, Jeremy and Abowd, John M. and Vilhuber, Lars} } @techreport {handle:1813:30922, title = {An Early Prototype of the Comprehensive Extensible Data Documentation and Access Repository (CED2AR)}, number = {1813:30922}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {An Early Prototype of the Comprehensive Extensible Data Documentation and Access Repository (CED2AR) Block, William C.; Williams, Jeremy; Abowd, John M.; Vilhuber, Lars; Lagoze, Carl This presentation will demonstrate the latest DDI-related technological developments of Cornell University{\textquoteright}s $3 million NSF-Census Research Network (NCRN) award, dedicated to improving the documentation, discoverability, and accessibility of public and restricted data from the federal statistical system in the United States. The current internal name for our DDI-based system is the Comprehensive Extensible Data Documentation and Access Repository (CED{\texttwosuperior}AR). 
CED{\texttwosuperior}AR ingests metadata from heterogeneous sources and supports filtered synchronization between restricted and public metadata holdings. Currently supported CED{\texttwosuperior}AR {\textquotedblleft}connector workflows{\textquotedblright} include mechanisms to ingest IPUMS, zero-observation files from the American Community Survey (DDI 2.1), and SIPP Synthetic Beta (DDI 1.2). These disparate metadata sources are all transformed into a DDI 2.5 compliant form and stored in a single repository. In addition, we will demonstrate an extension to DDI 2.5 that allows for the labeling of elements within the schema to indicate confidentiality. This metadata can then be filtered, allowing the creation of derived public use metadata from an original confidential source. This repository is currently searchable online through a prototype application demonstrating the ability to search across previously heterogeneous metadata sources. Presentation at the 4th Annual European DDI User Conference (EDDI12), Norwegian Social Science Data Services, Bergen, Norway, 3 December, 2012}, url = {http://hdl.handle.net/1813/30922}, author = {Block, William C. and Williams, Jeremy and Abowd, John M. and Vilhuber, Lars and Lagoze, Carl} } @inbook {Brandimarte2012, title = {The Economics of Privacy}, booktitle = {The Oxford Handbook of the Digital Economy}, year = {2012}, pages = {547-570}, publisher = {Oxford University Press}, organization = {Oxford University Press}, isbn = {9780195397840}, doi = {10.1093/oxfordhb/9780195397840.013.0020}, author = {Laura Brandimarte and Alessandro Acquisti}, editor = {Martin Peitz and Joel Waldfogel} } @booklet {Wikle2012b, title = {Efficient Time-Frequency Representations in High-Dimensional Spatial and Spatio-Temporal Models}, year = {2012}, month = {October}, author = {Wikle, C.K.} } @techreport {handle:1813:55327, title = {Encoding Provenance Metadata for Social Science Datasets}, number = {1813:55327}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {Encoding Provenance Metadata for Social Science Datasets Lagoze, Carl; Williams, Jeremy; Vilhuber, Lars Recording provenance is a key requirement for data-centric scholarship, allowing researchers to evaluate the integrity of source data sets and reproduce, and thereby, validate results. Provenance has become even more critical in the web environment in which data from distributed sources and of varying integrity can be combined and derived. Recent work by the W3C on the PROV model provides the foundation for semantically-rich, interoperable, and web-compatible provenance metadata. We apply that model to complex, but characteristic, provenance examples of social science data, describe scenarios that make scholarly use of those provenance descriptions, and propose a manner for encoding this provenance metadata within the widely-used DDI metadata standard. Submitted to Metadata and Semantics Research (MTSR 2013) conference.}, url = {http://hdl.handle.net/1813/55327}, author = {Lagoze, Carl and Williams, Jeremy and Vilhuber, Lars} } @inbook {NIPS2012_1456, title = {Entropy Estimations Using Correlated Symmetric Stable Random Projections}, booktitle = {Advances in Neural Information Processing Systems 25}, year = {2012}, pages = {3185{\textendash}3193}, url = {http://books.nips.cc/papers/files/nips25/NIPS2012_1456.pdf}, author = {Ping Li and Cun-Hui Zhang}, editor = {P. Bartlett and F.C.N. Pereira and C.J.C. Burges and L. Bottou and K.Q.
Weinberger} } @conference {2166, title = {Exploring interviewer and respondent interactions: An innovative behavior coding approach}, booktitle = {Midwest Association for Public Opinion Research 2012 Annual Conference}, year = {2012}, address = {Chicago, IL}, url = {http://www.mapor.org/conferences.html}, author = {Walton, L. and Stange, M. and Powell, R. and Belli, R.F.} } @conference {Wikle2012, title = {Hierarchical General Quadratic Nonlinear Models for Spatio-Temporal Dynamics}, booktitle = {Red Raider Conference}, year = {2012}, month = {October}, publisher = {Texas Tech University}, organization = {Texas Tech University}, address = {Lubbock, TX}, author = {Wikle, C.K.} } @techreport {handle:1813:30925, title = {The NSF-Census Research Network: Cornell Node}, number = {1813:30925}, year = {2012}, institution = {Cornell University}, type = {Preprint}, abstract = {The NSF-Census Research Network: Cornell Node Block, William C.; Lagoze, Carl; Vilhuber, Lars; Brown, Warren A.; Williams, Jeremy; Arguillas, Florio Cornell University has received a $3M NSF-Census Research Network (NCRN) award to improve the documentation and discoverability of both public and restricted data from the federal statistical system. The current internal name for this project is the Comprehensive Extensible Data Documentation and Access Repository (CED{\texttwosuperior}AR). The diagram to the right provides a high level architectural overview of the system to be implemented. The CED{\texttwosuperior}AR will be based upon leading metadata standards such as the Data Documentation Initiative (DDI) and Statistical Data and Metadata eXchange (SDMX) and be flexibly designed to ingest documentation from a variety of source files. It will permit synchronization between the public and confidential instances of the repository. The scholarly community will be able to use the CED{\texttwosuperior}AR as it would a conventional metadata repository, deprived only of the values of certain confidential information, but not their metadata. The authorized user, working on the secure Census Bureau network, could use the CED{\texttwosuperior}AR with full information in authorized domains.}, url = {http://hdl.handle.net/1813/30925}, author = {Block, William C. and Lagoze, Carl and Vilhuber, Lars and Brown, Warren A. and Williams, Jeremy and Arguillas, Florio} } @inbook {NIPS2012_1436, title = {One Permutation Hashing}, booktitle = {Advances in Neural Information Processing Systems 25}, year = {2012}, pages = {3122{\textendash}3130}, url = {http://books.nips.cc/papers/files/nips25/NIPS2012_1436.pdf}, author = {Ping Li and Art Owen and Cun-Hui Zhang}, editor = {P. Bartlett and F.C.N. Pereira and C.J.C. Burges and L. Bottou and K.Q. Weinberger} } @article {Holan2012b, title = {Rejoinder: An approach for identifying and predicting economic recessions in real time using time frequency functional models}, journal = {Applied Stochastic Models in Business and Industry}, volume = {28}, year = {2012}, pages = {504-505}, doi = {10.1002/asmb.1955}, url = {http://onlinelibrary.wiley.com/doi/10.1002/asmb.1955/full}, author = {Holan, S. and Yang, W. and Matteson, D. 
and Wikle, C.} } @inbook {Holan2012a, title = {Semiparametric Dynamic Design of Monitoring Networks for Non-Gaussian Spatio-Temporal Data}, booktitle = {Spatio-temporal Design: Advances in Efficient Data Acquisition}, year = {2012}, pages = {269-284}, publisher = {Wiley}, organization = {Wiley}, address = {Chichester, UK}, doi = {10.1002/9781118441862.ch12}, url = {http://onlinelibrary.wiley.com/doi/10.1002/9781118441862.ch12/summary}, author = {Holan, S. and Wikle, C.K.}, editor = {Jorge Mateu and Werner Muller} } @booklet {Wikle2012a, title = {Spatio-Temporal Statistics at Mizzou, Truman School of Public Affairs}, year = {2012}, month = {October}, author = {Wikle, C.K.} } @article {2661, title = {An ensemble quadratic echo state network for nonlinear spatio-temporal forecasting}, journal = {Stat}, abstract = {Spatio-temporal data and processes are prevalent across a wide variety of scientific disciplines. These processes are often characterized by nonlinear time dynamics that include interactions across multiple scales of spatial and temporal variability. The data sets associated with many of these processes are increasing in size due to advances in automated data measurement, management, and numerical simulator output. Nonlinear spatio-temporal models have only recently seen interest in statistics, but there are many classes of such models in the engineering and geophysical sciences. Traditionally, these models are more heuristic than those that have been presented in the statistics literature, but are often intuitive and quite efficient computationally. We show here that with fairly simple, but important, enhancements, the echo state network (ESN) machine learning approach can be used to generate long-lead forecasts of nonlinear spatio-temporal processes, with reasonable uncertainty quantification, and at only a fraction of the computational expense of traditional parametric nonlinear spatio-temporal models.}, url = {https://arxiv.org/abs/1708.05094}, author = {McDermott, P.L. and Wikle, C.K.} }