Examples

Complete Workflow Example

Here’s a complete example showing how to analyze RDS data from start to finish:

import pandas as pd
from RDSTools import (
    load_toy_data, RDSdata, RDSmean, RDStable, RDSlm,
    RDSnetgraph, RDSmap, get_available_seeds, print_map_info
)

# 1. Load and examine your data
# Option A: Use the included example dataset
toy_data = load_toy_data()
data = toy_data  # every later step of this workflow reads from `data`

# Option B: Load your own data instead (uncomment to override Option A)
# data = pd.read_csv("rds_survey.csv")

# NOTE: the column names passed to RDSdata in step 2 must exist in whichever
# dataset is loaded here -- adjust them if your columns are named differently.
print(data.columns)
print(f"Total participants: {len(data)}")

# 2. Process the RDS structure
# Identify the ID / coupon / degree columns and choose how zero and missing
# degree values are handled.
rds_data = RDSdata(
    data=data, unique_id="ID",
    redeemed_coupon="RecruitCoupon",
    issued_coupons=["Coupon_1", "Coupon_2", "Coupon_3"],
    degree="NetworkSize", zero_degree="median", NA_degree="hotdeck",
)

# Check the processed data via its SEED and WAVE columns
print(f"Seeds: {rds_data['SEED'].sum()}")
print(f"Max wave: {rds_data['WAVE'].max()}")

# 3. Calculate means with parallel processing
# (n_cores sets how many cores are used for the 1000 resamples)
mean_age = RDSmean(
    x='Age', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000, n_cores=4,
)
print(mean_age)

# 4. Generate frequency tables
# One-way table of Sex
sex_table = RDStable(
    x='Sex', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000,
)
print(sex_table)

# Two-way table of Sex by Race; margins=1 requests row proportions
cross_table = RDStable(
    x='Sex', y='Race', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000,
    margins=1,
)
print(cross_table)

# 5. Fit regression models
# C(...) in the formula marks Sex and Race as categorical predictors.
income_model = RDSlm(
    formula='Income ~ Age + C(Sex) + C(Race)',
    data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=2000, n_cores=6,
)
print(income_model)

Descriptive Statistics Examples

Unweighted mean with naive variance:

# No weight column is passed; var_est=None selects the naive method.
result = RDSmean(x='Age', data=rds_data, var_est=None)

Weighted mean with bootstrap variance:

# Weighted by the WEIGHT column; variance from 1000 'chain1' resamples.
result = RDSmean(
    x='Age', data=rds_data, weight='WEIGHT',
    var_est='chain1', resample_n=1000,
)

Return bootstrap means for custom analysis:

# With both return_* flags set, the call is unpacked into three values:
# the estimate, the bootstrap means and the node counts.
result, bootstrap_means, node_counts = RDSmean(
    x='Age', data=rds_data,
    var_est='tree_uni1', resample_n=1000,
    return_bootstrap_means=True, return_node_counts=True,
)

# Analyze bootstrap distribution
import numpy as np
print(f"Bootstrap mean: {np.mean(bootstrap_means)}")
print(f"Bootstrap SE: {np.std(bootstrap_means)}")

Table Examples

One-way tables, unweighted and weighted with bootstrap variance:

# Simple one-way table (no weights, no variance estimation arguments)
table = RDStable(x='Sex', data=rds_data)

# Weighted one-way table with bootstrap (500 resamples)
table = RDStable(
    x='Race', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=500,
)

Two-way tables with different proportions:

# The three calls differ only in `margins`.

# Cell proportions (default): margins=3
table_cell = RDStable(x='Sex', y='Race', data=rds_data, margins=3)

# Row proportions: margins=1
table_row = RDStable(x='Sex', y='Race', data=rds_data, margins=1)

# Column proportions: margins=2
table_col = RDStable(x='Sex', y='Race', data=rds_data, margins=2)

Regression Examples

Simple linear regression:

# Single predictor, no weights or resampling arguments.
model = RDSlm(formula='Income ~ Age', data=rds_data)

Multiple linear regression with categorical predictors:

# Categorical predictors are wrapped in C(...) inside the formula.
model = RDSlm(
    formula='Income ~ Age + C(Sex) + C(Education) + C(Race)',
    data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000, n_cores=4,
)

Logistic regression:

# Binary outcome (0/1): `Employed` is the response variable
model = RDSlm(
    formula='Employed ~ Age + C(Sex) + C(Education)',
    data=rds_data,
    var_est='chain1', resample_n=500,
)

Return bootstrap estimates:

# With both return_* flags set, the call is unpacked into three values:
# the fitted model, the bootstrap estimates and the node counts.
model, boot_estimates, node_counts = RDSlm(
    formula='Income ~ Age + C(Sex)', data=rds_data,
    var_est='tree_uni1', resample_n=1000,
    return_bootstrap_estimates=True, return_node_counts=True,
)

Network Visualization Examples

Basic network graph with different layouts:

# Spring layout (default): two seeds, waves 0-3
G = RDSnetgraph(
    data=rds_data, seed_ids=['1', '2'], waves=[0, 1, 2, 3],
    layout='Spring',
)

# Tree layout (hierarchical): one seed, waves 0-4, saved to a PNG file
G = RDSnetgraph(
    data=rds_data, seed_ids=['1'], waves=[0, 1, 2, 3, 4],
    layout='Tree', save_path='recruitment_tree.png',
)

# Circular layout: two seeds, waves 0-2, on a 12x12 figure
G = RDSnetgraph(
    data=rds_data, seed_ids=['1', '2'], waves=[0, 1, 2],
    layout='Circular', figsize=(12, 12),
)

Color nodes by demographic variables:

# Color by Sex (`variable` names the column used for node colors)
G = RDSnetgraph(
    data=rds_data, seed_ids=['1', '2', '3'], waves=[0, 1, 2],
    layout='Kamada-Kawai', variable='Sex',
    node_size=50, figsize=(16, 14),
)

# Color by Race
G = RDSnetgraph(
    data=rds_data, seed_ids=['1'], waves=[0, 1, 2, 3],
    layout='Spring', variable='Race',
    node_size=40,
)

Use custom colors for categories:

# First, check what categories exist (they'll be sorted)
print(sorted(rds_data['Race'].dropna().unique()))
# Output: ['Asian', 'Black', 'Hispanic', 'White']

# Provide colors in the same sorted order (one color per category above)
custom_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#95E1D3']

G = RDSnetgraph(
    data=rds_data, seed_ids=['1', '2'], waves=[0, 1, 2],
    variable='Race', category_colors=custom_colors,
    title='Recruitment by Race (Custom Colors)',
)

# Using named colors instead of hex codes
G = RDSnetgraph(
    data=rds_data, seed_ids=['1', '2', '3'], waves=[0, 1, 2, 3],
    variable='Sex',
    category_colors=['purple', 'orange'],  # For 2 categories
    layout='Tree', save_path='network_custom.png',
)

Geographic Mapping Examples

Check available mapping data:

# get_available_waves is not part of the import list at the top of this page,
# so bring it (and the other helpers used here) into scope explicitly.
from RDSTools import get_available_seeds, get_available_waves, print_map_info

# Print comprehensive map information
print_map_info(rds_data, lat='Latitude', long='Longitude')

# Get available seeds and waves
seeds = get_available_seeds(rds_data)
waves = get_available_waves(rds_data)
print(f"Seeds: {seeds}")
print(f"Waves: {waves}")

Basic map:

# Map two seeds over waves 0-3; the result is written to an HTML file.
m = RDSmap(
    data=rds_data, seed_ids=['1', '2'], waves=[0, 1, 2, 3],
    output_file='my_rds_map.html',
)

Map with custom coordinates and settings:

# `lat` / `long` name the coordinate columns to read from the data.
m = RDSmap(
    data=rds_data, seed_ids=['1', '2', '3'], waves=[0, 1, 2, 3, 4],
    lat='lat', long='long',
    zoom_start=10, open_browser=True,
    output_file='geographic_map.html',
)

Bootstrap Examples

Standalone bootstrap resampling:

from RDSTools import RDSboot

# Standard bootstrap: 1000 resamples of type 'tree_uni1'
boot_results = RDSboot(
    data=rds_data,
    respondent_id_col='ID', recruiter_id_col='R_ID',
    seed_id_col='S_ID', seed_col='SEED',
    type='tree_uni1', resample_n=1000,
)

# Check first resample: keep the rows with RESAMPLE.N == 1, then join them
# back to the survey data on the respondent ID.
sample_1 = boot_results[boot_results['RESAMPLE.N'] == 1]
merged = pd.merge(
    sample_1, rds_data,
    left_on='RESPONDENT_ID', right_on='ID',
)
print(f"Bootstrap sample size: {len(merged)}")

Parallel bootstrap for large datasets:

from RDSTools import RDSBootOptimizedParallel

# Same column arguments as RDSboot, plus n_cores; 10000 resamples here.
boot_results = RDSBootOptimizedParallel(
    data=rds_data,
    respondent_id_col='ID', recruiter_id_col='R_ID',
    seed_id_col='S_ID', seed_col='SEED',
    type='tree_uni1', resample_n=10000, n_cores=8,
)

Performance Comparison

The parallel bootstrap provides significant speedups:

Performance Comparison (252 observations)
Cores	Bootstrap Samples	Standard Time	Parallel Time	Speedup
1	1000	120s	120s	1.0x
4	1000	120s	18s	6.7x
8	1000	120s	12s	10.0x

Complete Analysis Pipeline

Here’s a complete pipeline from data loading to final results:

import pandas as pd
from RDSTools import (
    load_toy_data, RDSdata, RDSmean, RDStable, RDSlm,
    RDSnetgraph, RDSmap, get_available_seeds
)

# Load data
data = pd.read_csv("survey.csv")

# Process RDS structure (ID, redeemed coupon, issued coupons, degree columns)
rds_data = RDSdata(
    data=data, unique_id="ID",
    redeemed_coupon="CouponR",
    issued_coupons=["Coupon1", "Coupon2", "Coupon3"],
    degree="Degree",
)

# Descriptive statistics
age_mean = RDSmean(
    x='Age', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000, n_cores=4,
)

# Frequency tables
# One-way: Sex
sex_table = RDStable(
    x='Sex', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000,
)

# Two-way: Sex by Race with margins=1
race_sex_table = RDStable(
    x='Sex', y='Race', data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=1000,
    margins=1,
)

# Regression analysis
model = RDSlm(
    formula='Income ~ Age + C(Sex) + C(Race) + C(Education)',
    data=rds_data, weight='WEIGHT',
    var_est='tree_uni1', resample_n=2000, n_cores=4,
)

# Visualizations
# Both plots use the first two entries of the available seeds.
seeds = get_available_seeds(rds_data)

# Network graph
G = RDSnetgraph(
    data=rds_data, seed_ids=seeds[:2], waves=[0, 1, 2, 3],
    layout='Spring', variable='Sex',
    save_path='network.png',
)

# Geographic map
m = RDSmap(
    data=rds_data, seed_ids=seeds[:2], waves=[0, 1, 2, 3],
    output_file='map.html', open_browser=True,
)

# Print results: each heading is followed by its result on the next line
print("Age Mean:", age_mean, sep="\n")
print("\nSex Distribution:", sex_table, sep="\n")
print("\nRegression Model:", model, sep="\n")