@comment{
  Publications authored or co-authored by J. R. Bradley, listed newest first
  (2017 down to 2013). Citation keys are kept exactly as exported.
  NOTE(review): entry 2656 is typed as an article but no journal is recorded
  here; only its arXiv identifier is known. Confirm the venue before citing.
}

@techreport{2655,
  title         = {Computationally Efficient Multivariate Spatio-Temporal Models for High-Dimensional Count-Valued Data. (With Discussion)},
  institution   = {arXiv},
  number        = {1512.07273},
  year          = {2017},
  abstract      = {We introduce a Bayesian approach for multivariate spatio-temporal prediction for high-dimensional count-valued data. Our primary interest is when there are possibly millions of data points referenced over different variables, geographic regions, and times. This problem requires extensive methodological advancements, as jointly modeling correlated data of this size leads to the so-called "big n problem." The computational complexity of prediction in this setting is further exacerbated by acknowledging that count-valued data are naturally non-Gaussian. Thus, we develop a new computationally efficient distribution theory for this setting. In particular, we introduce a multivariate log-gamma distribution and provide substantial theoretical development including: results regarding conditional distributions, marginal distributions, an asymptotic relationship with the multivariate normal distribution, and full-conditional distributions for a Gibbs sampler. To incorporate dependence between variables, regions, and time points, a multivariate spatio-temporal mixed effects model (MSTM) is used. The results in this manuscript are extremely general, and can be used for data that exhibit fewer sources of dependency than what we consider (e.g., multivariate, spatial-only, or spatio-temporal-only data). Hence, the implications of our modeling framework may have a large impact on the general problem of jointly modeling correlated count-valued data. We show the effectiveness of our approach through a simulation study. Additionally, we demonstrate our proposed methodology with an important application analyzing data obtained from the Longitudinal Employer-Household Dynamics (LEHD) program, which is administered by the U.S. Census Bureau.},
  keywords      = {Aggregation, American Community Survey, Bayesian hierarchical model, Big Data, Longitudinal Employer-Household Dynamics (LEHD) program, Markov chain Monte Carlo, Non-Gaussian, Quarterly Workforce Indicators},
  eprint        = {1512.07273},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1512.07273},
  author        = {Bradley, J. R. and Holan, S. H. and Wikle, C. K.}
}

@article{2656,
  title         = {Multi-rubric Models for Ordinal Spatial Data with Application to Online Ratings from {Yelp}},
  year          = {2017},
  abstract      = {Interest in online rating data has increased in recent years. Such data consists of ordinal ratings of products or local businesses provided by users of a website, such as \texttt{Yelp} or \texttt{Amazon}. One source of heterogeneity in ratings is that users apply different standards when supplying their ratings; even if two users benefit from a product the same amount, they may translate their benefit into ratings in different ways. In this article we propose an ordinal data model, which we refer to as a multi-rubric model, which treats the criteria used to convert a latent utility into a rating as user-specific random effects, with the distribution of these random effects being modeled nonparametrically. We demonstrate that this approach is capable of accounting for this type of variability in addition to usual sources of heterogeneity due to item quality, user biases, interactions between items and users, and the spatial structure of the users and items. We apply the model developed here to publicly available data from the website \texttt{Yelp} and demonstrate that it produces interpretable clusterings of users according to their rating behavior, in addition to providing better predictions of ratings and better summaries of overall item quality.},
  keywords      = {Bayesian hierarchical model, Data augmentation, Nonparametric Bayes, ordinal data, recommender systems, spatial prediction},
  eprint        = {1706.03012},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1706.03012},
  author        = {Linero, A. R. and Bradley, J. R. and Desai, A.}
}

@article{2657,
  title    = {Regionalization of Multiscale Spatial Processes using a Criterion for Spatial Aggregation Error},
  journal  = {Journal of the Royal Statistical Society -- Series B},
  year     = {2017},
  abstract = {The modifiable areal unit problem and the ecological fallacy are known problems that occur when modeling multiscale spatial processes. We investigate how these forms of spatial aggregation error can guide a regionalization over a spatial domain of interest. By "regionalization" we mean a specification of geographies that define the spatial support for areal data. This topic has been studied vigorously by geographers, but has been given less attention by spatial statisticians. Thus, we propose a criterion for spatial aggregation error (CAGE), which we minimize to obtain an optimal regionalization. To define CAGE we draw a connection between spatial aggregation error and a new multiscale representation of the Karhunen-Loeve (K-L) expansion. This relationship between CAGE and the multiscale K-L expansion leads to illuminating theoretical developments including: connections between spatial aggregation error, squared prediction error, spatial variance, and a novel extension of Obled-Creutin eigenfunctions. The effectiveness of our approach is demonstrated through an analysis of two datasets, one using the American Community Survey and one related to environmental ocean winds.},
  keywords = {American Community Survey, empirical orthogonal functions, MAUP, Reduced rank, Spatial basis functions, Survey data},
  url      = {https://arxiv.org/abs/1502.01974},
  author   = {Bradley, J. R. and Wikle, C. K. and Holan, S. H.}
}

@article{2665,
  title    = {{Bayesian} Hierarchical Models with Conjugate Full-Conditional Distributions for Dependent Data from the Natural Exponential Family},
  journal  = {Journal of the American Statistical Association - T\&M},
  year     = {2016},
  abstract = {We introduce a Bayesian approach for analyzing (possibly) high-dimensional dependent data that are distributed according to a member from the natural exponential family of distributions. This problem requires extensive methodological advancements, as jointly modeling high-dimensional dependent data leads to the so-called "big n problem." The computational complexity of the "big n problem" is further exacerbated when allowing for non-Gaussian data models, as is the case here. Thus, we develop new computationally efficient distribution theory for this setting. In particular, we introduce something we call the "conjugate multivariate distribution," which is motivated by the univariate distribution introduced in Diaconis and Ylvisaker (1979). Furthermore, we provide substantial theoretical and methodological development including: results regarding conditional distributions, an asymptotic relationship with the multivariate normal distribution, conjugate prior distributions, and full-conditional distributions for a Gibbs sampler. The results in this manuscript are extremely general, and can be adapted to many different settings. We demonstrate the proposed methodology through simulated examples and analyses based on estimates obtained from the US Census Bureau{\textquoteright}s American Community Survey (ACS).},
  url      = {https://arxiv.org/abs/1701.07506},
  author   = {Bradley, J. R. and Holan, S. H. and Wikle, C. K.}
}

@article{2666,
  title    = {{Bayesian} Spatial Change of Support for Count-Valued Survey Data with Application to the {American Community Survey}},
  journal  = {Journal of the American Statistical Association},
  year     = {2016},
  pages    = {472--487},
  abstract = {We introduce Bayesian spatial change of support methodology for count-valued survey data with known survey variances. Our proposed methodology is motivated by the American Community Survey (ACS), an ongoing survey administered by the U.S. Census Bureau that provides timely information on several key demographic variables. Specifically, the ACS produces 1-year, 3-year, and 5-year "period-estimates," and corresponding margins of errors, for published demographic and socio-economic variables recorded over predefined geographies within the United States. Despite the availability of these predefined geographies it is often of interest to data users to specify customized user-defined spatial supports. In particular, it is useful to estimate demographic variables defined on "new" spatial supports in "real-time." This problem is known as spatial change of support (COS), which is typically performed under the assumption that the data follows a Gaussian distribution. However, count-valued survey data is naturally non-Gaussian and, hence, we consider modeling these data using a Poisson distribution. Additionally, survey-data are often accompanied by estimates of error, which we incorporate into our analysis. We interpret Poisson count-valued data in small areas as an aggregation of events from a spatial point process. This approach provides us with the flexibility necessary to allow ACS users to consider a variety of spatial supports in "real-time." We demonstrate the effectiveness of our approach through a simulated example as well as through an analysis using public-use ACS data.},
  url      = {https://arxiv.org/abs/1405.7227},
  author   = {Bradley, J. R. and Wikle, C. K. and Holan, S. H.}
}

@article{2669,
  title    = {Multivariate Spatio-Temporal Survey Fusion with Application to the {American Community Survey} and {Local Area Unemployment Statistics}},
  journal  = {Stat},
  year     = {2016},
  pages    = {224--233},
  abstract = {There are often multiple surveys available that estimate and report related demographic variables of interest that are referenced over space and/or time. Not all surveys produce the same information, and thus, combining these surveys typically leads to higher quality estimates. That is, not every survey has the same level of precision nor do they always provide estimates of the same variables. In addition, various surveys often produce estimates with incomplete spatio-temporal coverage. By combining surveys using a Bayesian approach, we can account for different margins of error and leverage dependencies to produce estimates of every variable considered at every spatial location and every time point. Specifically, our strategy is to use a hierarchical modelling approach, where the first stage of the model incorporates the margin of error associated with each survey. Then, in a lower stage of the hierarchical model, the multivariate spatio-temporal mixed effects model is used to incorporate multivariate spatio-temporal dependencies of the processes of interest. We adopt a fully Bayesian approach for combining surveys; that is, given all of the available surveys, the conditional distributions of the latent processes of interest are used for statistical inference. To demonstrate our proposed methodology, we jointly analyze period estimates from the US Census Bureau{\textquoteright}s American Community Survey, and estimates obtained from the Bureau of Labor Statistics Local Area Unemployment Statistics program. Copyright {\textcopyright} 2016 John Wiley \& Sons, Ltd.},
  doi      = {10.1002/sta4.120},
  url      = {http://onlinelibrary.wiley.com/doi/10.1002/sta4.120/full},
  author   = {Bradley, J. R. and Holan, S. H. and Wikle, C. K.}
}

@article{2083,
  title   = {Comparing and selecting spatial predictors using local criteria},
  journal = {Test},
  volume  = {24},
  year    = {2015},
  month   = mar,
  pages   = {1--28},
  chapter = {1},
  issn    = {1133-0686},
  doi     = {10.1007/s11749-014-0415-1},
  url     = {http://dx.doi.org/10.1007/s11749-014-0415-1},
  author  = {Bradley, J. R. and Cressie, N. and Shi, T.}
}

@article{1882,
  title   = {Multiscale Analysis of Survey Data: Recent Developments and Exciting Prospects},
  journal = {Statistics Views},
  year    = {2015},
  author  = {Bradley, J. R. and Wikle, C. K. and Holan, S. H.}
}

@article{2169,
  title    = {Multivariate Spatio-Temporal Models for High-Dimensional Areal Data with Application to {Longitudinal Employer-Household Dynamics}},
  journal  = {Annals of Applied Statistics},
  volume   = {9},
  year     = {2015},
  month    = mar,
  abstract = {Many data sources report related variables of interest that are also referenced over geographic regions and time; however, there are relatively few general statistical methods that one can readily use that incorporate these multivariate spatio-temporal dependencies. Additionally, many multivariate spatio-temporal areal datasets are extremely high-dimensional, which leads to practical issues when formulating statistical models. For example, we analyze Quarterly Workforce Indicators (QWI) published by the US Census Bureau{\textquoteright}s Longitudinal Employer-Household Dynamics (LEHD) program. QWIs are available by different variables, regions, and time points, resulting in millions of tabulations. Despite their already expansive coverage, by adopting a fully Bayesian framework, the scope of the QWIs can be extended to provide estimates of missing values along with associated measures of uncertainty. Motivated by the LEHD, and other applications in federal statistics, we introduce the multivariate spatio-temporal mixed effects model (MSTM), which can be used to efficiently model high-dimensional multivariate spatio-temporal areal datasets. The proposed MSTM extends the notion of Moran{\textquoteright}s I basis functions to the multivariate spatio-temporal setting. This extension leads to several methodological contributions including extremely effective dimension reduction, a dynamic linear model for multivariate spatio-temporal areal processes, and the reduction of a high-dimensional parameter space using a novel parameter model.},
  doi      = {10.1214/15-AOAS862},
  author   = {Bradley, J. R. and Holan, S. H. and Wikle, C. K.}
}

@article{2084,
  title   = {Rejoinder on: Comparing and selecting spatial predictors using local criteria},
  journal = {Test},
  volume  = {24},
  year    = {2015},
  month   = mar,
  pages   = {54--60},
  issn    = {1133-0686},
  doi     = {10.1007/s11749-014-0414-2},
  url     = {http://dx.doi.org/10.1007/s11749-014-0414-2},
  author  = {Bradley, J. R. and Cressie, N. and Shi, T.}
}

@inproceedings{Bradley2014,
  title     = {The {Poisson} Change of Support Problem with Applications to the {American Community Survey}},
  booktitle = {Joint Statistical Meetings 2014},
  year      = {2014},
  author    = {Bradley, J. R.}
}

@inproceedings{Bradley2014a,
  title     = {Survey Fusion for Data that Exhibit Multivariate, Spatio-Temporal Dependencies},
  booktitle = {Joint Statistical Meetings 2014},
  year      = {2014},
  author    = {Bradley, J. R.}
}

@booklet{Bradley2013,
  title        = {A Reduced Rank Model for Analyzing Multivariate Spatial Datasets},
  howpublished = {University of Missouri-Kansas City},
  year         = {2013},
  month        = nov,
  publisher    = {University of Missouri-Kansas City},
  author       = {Bradley, J. R.}
}