Seasons 2.0
Combining individual-level and group-level data
Demonstration of the mepoisson command in Stata; ultimately translate to R
1
* Document session using .log file
capture log close
log using poisson.log, replace
* Allow up to 10,000 seconds for web reads (timeout1 is measured in seconds)
set timeout1 10000
* External data source (GitHub repository)
global repo "https://github.com/muzaale/got/raw/main/act_5/act_5_9/"
* Load the dataset
use ${repo}donor_live_keep.dta, clear
* Change to the working directory where output will be saved
cd "~/dropbox/1f.ἡἔρις,κ/1.ontology"
if 1 {
* Recode donor relationship and race variables
recode don_relation (1/4=1)(5/999=0), gen(related)
recode don_race (8=1)(16=2)(2000=3)(64=4)(24/1500 .=5), ///
gen(racecat)
* Create year and month variables based on donation recovery date
gen year = year(don_recov_dt)
gen month = month(don_recov_dt)
* Tabulate donations by year
tab year
* Tabulate month and calculate the June-to-January ratio
tab month, matcell(cellvalues)
local freq_jan = cellvalues[1,1]
local freq_jun = cellvalues[6,1]
local ratio = `freq_jun' / `freq_jan'
di `ratio'
* Create a constant variable 'count' and recode 'month' for summer vs. rest of the year
gen count = 1
recode month (1/5 9/12=0)(6/8=1), gen(summer)
}
* Deactivated by 'if 0': egen keeps the data at the individual level, which is more efficient than collapse followed by a merge
if 0 {
* Count donations within each year-month-summer-relation cell
egen n_month = count(don_recov_dt), by(year month summer related)
}
* We have individual-level data to this point
* Preserve the original dataset
preserve
collapse (count) donations=pers_id, by(summer)
di (donations[2]/3)/(donations[1]/9) // back-of-the-envelope IRR: mean monthly donations in the 3 summer months vs. the other 9
restore
preserve
* Collapse the data by year, month, summer, and relation
collapse (count) donations=pers_id, by(year month summer related)
* Generate and save graphs for related and unrelated donors
twoway ///
(scatter donations year if related==1) ///
(lowess donations year if related==1, ///
legend(off) ///
xlab(1990(10)2020) ///
xti("") ///
)
graph save related.gph, replace
twoway ///
(scatter donations year if related==0) ///
(lowess donations year if related==0, ///
legend(off) ///
xlab(1990(10)2020) ///
xti("") ///
)
graph save unrelated.gph, replace
* Combine graphs and save as PNG
graph combine related.gph unrelated.gph, ///
ycommon ///
ti("Number of living kidney donors per month")
graph export donations_years.png, replace
* Save the collapsed data to a temporary file
tempfile aggregated_data
save `aggregated_data'
restore
* Merge the aggregated count back to the individual-level data
merge m:1 year month summer related using `aggregated_data'
* Now you have both group-level and individual-level variables as predictors
* Run the single-level Poisson model first to get starting estimates
poisson donations summer related, irr iter(5)
* Capture the estimates for later use; irr affects only the display, so e(b) stays on the log scale
matrix start_vals = e(b)
if 0 {
* Inspect the starting values before using them
matrix list start_vals
* Run the multilevel Poisson model, seeding it with the single-level estimates
mepoisson donations || summer:, irr from(start_vals) iter(3)
}
log close
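An aside for readers following along in Python: the collapse-and-merge round trip above (and the deactivated egen alternative) boils down to attaching a group-level count to every individual row, which pandas expresses as a single grouped transform. A minimal sketch on toy data, with column names mirroring the Stata variables; this is an illustration, not part of the original workflow:

import pandas as pd

# Toy individual-level records standing in for donor_live_keep.dta
df = pd.DataFrame({
    "pers_id": range(8),
    "year":    [2019, 2019, 2019, 2020, 2020, 2020, 2020, 2020],
    "month":   [6, 6, 1, 6, 6, 6, 1, 1],
    "related": [1, 1, 0, 1, 0, 0, 1, 1],
})
df["summer"] = df["month"].between(6, 8).astype(int)

# Attach the group-level count to each row: the pandas analogue of
# Stata's collapse + merge m:1 (or, equivalently, egen count ..., by())
df["donations"] = (
    df.groupby(["year", "month", "summer", "related"])["pers_id"]
      .transform("size")
)
print(df)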
2
----------------------------------------------------------------------------------------------------------------
name: <unnamed>
log: /Users/d/Dropbox (Personal)/1f.ἡἔρις,κ/1.ontology/poisson.log
log type: text
opened on: 5 Sep 2023, 15:18:48
.
. * Allow up to 10,000 seconds for web reads (timeout1 is measured in seconds)
. set timeout1 10000
.
. * External data source (GitHub repository)
. global repo "https://github.com/muzaale/got/raw/main/act_5/act_5_9/"
.
. * Load the dataset
. use ${repo}donor_live_keep.dta, clear
.
. * Change to the working directory where output will be saved
. cd "~/dropbox/1f.ἡἔρις,κ/1.ontology"
/Users/d/Dropbox (Personal)/1f.ἡἔρις,κ/1.ontology
.
.
. if 1 {
.
. * Recode donor relationship and race variables
. recode don_relation (1/4=1)(5/999=0), gen(related)
(161,880 differences between don_relationship_ty and related)
. recode don_race (8=1)(16=2)(2000=3)(64=4)(24/1500 .=5), ///
> gen(racecat)
(186,545 differences between don_race and racecat)
.
. * Create year and month variables based on donation recovery date
. gen year = year(don_recov_dt)
. gen month = month(don_recov_dt)
.
. * Tabulate donations by year
. tab year
year | Freq. Percent Cum.
------------+-----------------------------------
1987 | 402 0.22 0.22
1988 | 1,829 0.98 1.20
1989 | 1,918 1.03 2.22
1990 | 2,122 1.14 3.36
1991 | 2,427 1.30 4.66
1992 | 2,571 1.38 6.04
1993 | 2,905 1.56 7.60
1994 | 3,104 1.66 9.26
1995 | 3,496 1.87 11.14
1996 | 3,803 2.04 13.17
1997 | 4,067 2.18 15.36
1998 | 4,570 2.45 17.80
1999 | 5,046 2.70 20.51
2000 | 5,947 3.19 23.70
2001 | 6,621 3.55 27.25
2002 | 6,631 3.55 30.80
2003 | 6,828 3.66 34.46
2004 | 7,005 3.76 38.22
2005 | 6,904 3.70 41.92
2006 | 6,733 3.61 45.53
2007 | 6,315 3.39 48.91
2008 | 6,218 3.33 52.25
2009 | 6,610 3.54 55.79
2010 | 6,561 3.52 59.31
2011 | 6,023 3.23 62.54
2012 | 5,868 3.15 65.68
2013 | 5,989 3.21 68.89
2014 | 5,820 3.12 72.01
2015 | 5,989 3.21 75.22
2016 | 5,975 3.20 78.42
2017 | 6,181 3.31 81.74
2018 | 6,845 3.67 85.41
2019 | 7,389 3.96 89.37
2020 | 5,725 3.07 92.44
2021 | 6,539 3.51 95.94
2022 | 6,466 3.47 99.41
2023 | 1,103 0.59 100.00
------------+-----------------------------------
Total | 186,545 100.00
.
. * Tabulate month and calculate the June-to-January ratio
. tab month, matcell(cellvalues)
month | Freq. Percent Cum.
------------+-----------------------------------
1 | 15,331 8.22 8.22
2 | 14,245 7.64 15.85
3 | 15,096 8.09 23.95
4 | 14,290 7.66 31.61
5 | 14,788 7.93 39.53
6 | 17,795 9.54 49.07
7 | 17,460 9.36 58.43
8 | 16,147 8.66 67.09
9 | 14,361 7.70 74.79
10 | 15,490 8.30 83.09
11 | 15,598 8.36 91.45
12 | 15,944 8.55 100.00
------------+-----------------------------------
Total | 186,545 100.00
. local freq_jan = cellvalues[1,1]
. local freq_jun = cellvalues[6,1]
. local ratio = `freq_jun' / `freq_jan'
. di `ratio'
1.1607201
.
. * Create a constant variable 'count' and recode 'month' for summer vs. rest of the year
. gen count = 1
. recode month (1/5 9/12=0)(6/8=1), gen(summer)
(186,545 differences between month and summer)
.
. }
.
.
. * Deactivated by 'if 0': egen keeps the data at the individual level, which is more efficient than collapse followed by a merge
. if 0 {
.
. * Count donations within each year-month-summer-relation cell
. egen n_month = count(don_recov_dt), by(year month summer related)
.
. }
.
. * We have individual-level data to this point
. * Preserve the original dataset
. preserve
. collapse (count) donations=pers_id, by(summer)
. di (donations[2]/3)/(donations[1]/9) // back-of-the-envelope IRR: mean monthly donations in the 3 summer months vs. the other 9
1.141058
. restore
. preserve
. * Collapse the data by year, month, summer, and relation
. collapse (count) donations=pers_id, by(year month summer related)
.
. * Generate and save graphs for related and unrelated donors
. twoway ///
> (scatter donations year if related==1) ///
> (lowess donations year if related==1, ///
> legend(off) ///
> xlab(1990(10)2020) ///
> xti("") ///
> )
. graph save related.gph, replace
file related.gph saved
. twoway ///
> (scatter donations year if related==0) ///
> (lowess donations year if related==0, ///
> legend(off) ///
> xlab(1990(10)2020) ///
> xti("") ///
> )
. graph save unrelated.gph, replace
file unrelated.gph saved
.
. * Combine graphs and save as PNG
. graph combine related.gph unrelated.gph, ///
> ycommon ///
> ti("Number of living kidney donors per month")
. graph export donations_years.png, replace
file /Users/d/Dropbox (Personal)/1f.ἡἔρις,κ/1.ontology/donations_years.png saved as PNG format
.
. * Save the collapsed data to a temporary file
. tempfile aggregated_data
. save `aggregated_data'
file /var/folders/sx/fd6zgj191mx45hspzbgwzlnr0000gn/T//S_16143.000003 saved as .dta format
. restore
.
. * Merge the aggregated count back to the individual-level data
. merge m:1 year month summer related using `aggregated_data'
    Result                      Number of obs
    -----------------------------------------
    Not matched                             0
    Matched                           186,545  (_merge==3)
    -----------------------------------------
.
. * Now you have both group-level and individual-level variables as predictors
. * Run the single-level Poisson model first to get starting estimates
. poisson donations summer related, irr iter(5)
Iteration 0: Log likelihood = -2917852
Iteration 1: Log likelihood = -2917852
Poisson regression                              Number of obs =   184,671
                                                LR chi2(2)    = 369742.82
                                                Prob > chi2   =    0.0000
Log likelihood = -2917852                       Pseudo R2     =    0.0596
------------------------------------------------------------------------------
donations | IRR Std. err. z P>|z| [95% conf. interval]
-------------+----------------------------------------------------------------
summer | 1.148387 .0003555 446.89 0.000 1.14769 1.149084
related | .8867447 .0002529 -421.42 0.000 .8862491 .8872406
_cons | 272.2428 .060666 2.5e+04 0.000 272.1239 272.3617
------------------------------------------------------------------------------
Note: _cons estimates baseline incidence rate.
.
. * Capture the estimates for later use; irr affects only the display, so e(b) stays on the log scale
. matrix start_vals = e(b)
.
. if 0 {
.
. * Inspect the starting values before using them
. matrix list start_vals
.
. * Run the multilevel Poisson model, seeding it with the single-level estimates
. mepoisson donations || summer:, irr from(start_vals) iter(3)
.
. }
.
. log close
name: <unnamed>
log: /Users/d/Dropbox (Personal)/1f.ἡἔρις,κ/1.ontology/poisson.log
log type: text
closed on: 5 Sep 2023, 15:18:56
----------------------------------------------------------------------------------------------------------------
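As a sanity check, the back-of-envelope IRR printed in the log (1.141058) can be reproduced from the month tabulation above:

# Monthly donation counts copied from the log's month tabulation
months = {1: 15331, 2: 14245, 3: 15096, 4: 14290, 5: 14788, 6: 17795,
          7: 17460, 8: 16147, 9: 14361, 10: 15490, 11: 15598, 12: 15944}
summer_total = sum(months[m] for m in (6, 7, 8))                      # 51,402 over 3 months
rest_total = sum(v for m, v in months.items() if m not in (6, 7, 8))  # 135,143 over 9 months
print((summer_total / 3) / (rest_total / 9))                          # 1.1410..., as in the log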
3
[Figure: donations_years.png - "Number of living kidney donors per month", related and unrelated donors by year]
4
If your single-level Poisson model has produced reasonable results, you can potentially use its estimates as starting values for the corresponding parameters in the multilevel model. One caveat: recent Stata releases do list from() among mepoisson's maximization options, but the coefficient vector saved by poisson does not line up one-to-one with the multilevel parameter vector, which also contains the random-effects variance components, so seeding mepoisson this way may require from()'s skip suboption or manual padding of the vector.
Here's an example in Stata (the if 1 {} / if 0 {} wrappers in your script are just on/off switches; replace them with real conditions or drop them):
* Run the single-level Poisson model first to get starting estimates
poisson donations summer related, irr iter(5)
* Capture the estimates
matrix start_vals = e(b)
* Display the estimates; just for checking
di "Starting values from the single-level model:"
matrix list start_vals
* Run the multilevel model; if from() rejects the single-level vector,
* either drop it or pass from(start_vals, skip) so unmatched parameters are ignored
mepoisson donations || summer:, irr iter(3)
In your pseudo-code, you mention merging the aggregated data back to the individual-level data. If you've actually aggregated the data, remember that this will affect the fit of your model. Also, as noted above, the from(start_vals) call will only work once the vector is made conformable with mepoisson's parameter vector; as written it is not directly applicable.
If convergence continues to be a problem, you may want to explore whether Stata’s ML engine options (like different optimization algorithms or providing analytical derivatives if possible) could be useful, though these can be quite technical and are generally used when other options have failed.
Remember to interpret the results carefully, especially when there are convergence issues, as they may not be reliable.
I'm open to suggestions, including Python and machine learning
If you are interested in Python-based solutions, you could use the statsmodels package, which provides classes for estimating and interpreting many different statistical models, including Poisson and negative binomial models. The scikit-learn package also provides a PoissonRegressor class for fitting Poisson regression models.
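For completeness, here is a minimal PoissonRegressor sketch on hypothetical arrays. Note that scikit-learn penalizes the coefficients by default, so alpha=0.0 is needed to reproduce an unpenalized Poisson fit, and the fitted coefficients are on the log scale:

import numpy as np
from sklearn.linear_model import PoissonRegressor

# Hypothetical design matrix: columns play the role of 'summer' and 'related'
X = np.array([[1, 1], [1, 0], [0, 1], [0, 0]] * 50)
y = np.random.default_rng(1).poisson(np.exp(3 + 0.14 * X[:, 0] - 0.12 * X[:, 1]))

sk_model = PoissonRegressor(alpha=0.0)  # alpha=0 disables the default L2 penalty
sk_model.fit(X, y)
print(sk_model.intercept_, sk_model.coef_)  # log-rate scale; exponentiate for rate ratios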
For hierarchical/multilevel models, the pymc3 package allows for Bayesian hierarchical models, and you can specify initial values. Here's a simplified example using statsmodels:
import pandas as pd
import statsmodels.api as sm
# Let's assume `df` is your DataFrame containing 'donations', 'summer', 'related'
# df = pd.read_csv('your_data.csv')
# Adding constant term for intercept
df['intercept'] = 1.0
# Defining the dependent variable (y) and independent variables (X)
y = df['donations']
X = df[['intercept', 'summer', 'related']]
# Fit Poisson regression model
model = sm.GLM(y, X, family=sm.families.Poisson()).fit()
# Show model summary
print(model.summary())
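Stata's irr option corresponds to exponentiating these coefficients: a Poisson coefficient is a log incidence-rate ratio, so the IRRs and their confidence limits fall out directly from the fitted model above:

import numpy as np

# Exponentiated coefficients and confidence limits are incidence-rate ratios
print(np.exp(model.params))
print(np.exp(model.conf_int()))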
For a hierarchical model with pymc3:
import pymc3 as pm
import numpy as np

# Again, assume 'donations', 'summer', 'related' are your data;
# 'summer' must be an integer array because it indexes the
# group-level parameters below
# donations = np.array(df['donations'])
# summer = np.array(df['summer'])
# related = np.array(df['related'])

with pm.Model() as model:
    # Hyperpriors
    mu_intercept = pm.Normal('mu_intercept', mu=0, sd=1)
    sigma_intercept = pm.HalfNormal('sigma_intercept', sd=1)
    mu_slope = pm.Normal('mu_slope', mu=0, sd=1)
    sigma_slope = pm.HalfNormal('sigma_slope', sd=1)

    # Intercept and slope, one per level of 'summer'
    Intercept = pm.Normal('Intercept', mu=mu_intercept, sd=sigma_intercept,
                          shape=len(set(summer)))
    Slope = pm.Normal('Slope', mu=mu_slope, sd=sigma_slope,
                      shape=len(set(summer)))

    # Expected value (log link)
    theta = pm.math.exp(Intercept[summer] + Slope[summer] * related)

    # Likelihood
    donations_obs = pm.Poisson('donations_obs', mu=theta, observed=donations)

    # Fitting the model
    trace = pm.sample(2000)
You can inspect the trace object to look at parameter estimates and other diagnostic information.
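For instance, pymc3's built-in summary tabulates posterior means, credible intervals, and convergence diagnostics (assuming the sampling above completed):

# Posterior summary: means, credible intervals, effective sample sizes, R-hat
print(pm.summary(trace))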
Remember that this is a simplified example, and you’ll need to adjust the models according to your actual data and research questions.
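One more bridge back to the starting-values discussion: statsmodels' GLM.fit() accepts a start_params argument, so the Stata idea of seeding a harder fit with a cheaper one carries over directly. A minimal sketch on simulated data (the data and coefficients below are made up for illustration):

import numpy as np
import pandas as pd
import statsmodels.api as sm

# Simulated stand-in for the merged donor file
rng = np.random.default_rng(0)
df = pd.DataFrame({"summer": rng.integers(0, 2, 500),
                   "related": rng.integers(0, 2, 500)})
df["donations"] = rng.poisson(np.exp(5.6 + 0.14 * df["summer"] - 0.12 * df["related"]))

X = sm.add_constant(df[["summer", "related"]])
# Cheap preliminary fit, capped at a few iterations (cf. iter(5) in Stata)
first = sm.GLM(df["donations"], X, family=sm.families.Poisson()).fit(maxiter=5)
# Final fit seeded with the preliminary coefficients
second = sm.GLM(df["donations"], X, family=sm.families.Poisson()).fit(
    start_params=first.params, maxiter=100)
print(np.exp(second.params))  # IRR scale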
5
Certainly, I can help translate your Stata script (combining individual-level and group-level data) to R code. Below is the equivalent R code that should accomplish the same tasks you’ve laid out in the Stata script.
Firstly, you would need to install and load the required R packages if you haven’t already:
install.packages(c("haven", "dplyr", "ggplot2", "lme4"))
library(haven)    # read_dta() reads Stata .dta files
library(dplyr)
library(ggplot2)
library(lme4)
Now, let’s move on to the script.
# Document the session by diverting console output to a log file
# (sink() captures output only, not the commands themselves)
sink("poisson.log")

# Setting the download timeout to 10000 seconds (R's 'timeout' option is in seconds)
options(timeout = 10000)

# External data source (GitHub repository)
repo <- "https://github.com/muzaale/got/raw/main/act_5/act_5_9/"

# Load the Stata dataset; fread() cannot parse .dta files,
# so use haven::read_dta(), which also accepts URLs
data <- read_dta(paste0(repo, "donor_live_keep.dta"))
# Change the working directory (for any files written later)
setwd("~/dropbox/1f.ἡἔρις,κ/1.ontology")
# Recode donor relationship and race variables
# (Stata abbreviates the variable as don_relation; its full name,
# per the log, is don_relationship_ty)
data <- data %>%
  mutate(related = case_when(
           don_relationship_ty %in% 1:4 ~ 1,
           don_relationship_ty %in% 5:999 ~ 0,
           TRUE ~ NA_real_           # Stata's recode leaves missing as missing
         ),
         racecat = case_when(
           don_race == 8 ~ 1,
           don_race == 16 ~ 2,
           don_race == 2000 ~ 3,
           don_race == 64 ~ 4,
           TRUE ~ 5                  # 24/1500 and missing -> 5, as in Stata
         ))
# Create year and month variables based on donation recovery date
data$year <- format(as.Date(data$don_recov_dt), "%Y")
data$month <- format(as.Date(data$don_recov_dt), "%m")
# Tabulate donations by year
table(data$year)
# Tabulate month and calculate the June-to-January ratio
month_tab <- table(data$month)
freq_jan <- month_tab["01"]
freq_jun <- month_tab["06"]
ratio <- freq_jun / freq_jan
print(ratio)
# Create a constant variable 'count' and recode 'month' for summer vs. rest of the year
data <- data %>%
mutate(count = 1,
summer = ifelse(month %in% c("06", "07", "08"), 1, 0))
# Deactivated, mirroring the 'if 0' block in the Stata script
if (FALSE) {
  data <- data %>%
    group_by(year, month, summer, related) %>%
    summarise(n_month = n())
}
# Attach the group-level count to each individual row: the dplyr analogue of
# Stata's collapse + merge m:1, which the glm() call below depends on
data <- data %>%
  group_by(year, month, summer, related) %>%
  mutate(donations = n()) %>%
  ungroup()

# Running single-level Poisson model first to get starting estimates
poisson_model <- glm(donations ~ summer + related, family = poisson, data = data)
start_vals <- coef(poisson_model)
# Check the starting values before using them
print(start_vals)
# Run the multilevel Poisson model (uncomment to proceed). Note that glmer()'s
# 'start' argument expects a named list (components 'theta' and/or 'fixef'),
# not the plain coefficient vector from coef(), so only the intercept is
# passed through here.
if (FALSE) {
  mepoisson_model <- glmer(donations ~ 1 + (1 | summer),
                           family = poisson, data = data,
                           start = list(fixef = start_vals["(Intercept)"]))
}
# Close the log
sink()
This should accomplish the same tasks as your original Stata code. Let me know if you have any questions or need further clarification.