# Source code for generalizit.generalizit

import re
from typing import Tuple, Optional, Dict, List
import numpy as np
import pandas as pd
from generalizit.design import Design
from generalizit.design_utils import parse_facets, match_research_design, validate_research_design, get_facets_from_variance_tuple_dictionary
import warnings


# ---- GeneralizIT Class ----
class GeneralizIT:
    """
    High-level API for Generalizability Theory analysis.

    GeneralizIT provides a user-friendly interface for conducting Generalizability Theory (G-Theory)
    analyses, including variance component estimation, reliability coefficient calculation,
    Decision studies (D-studies), and confidence interval estimation.

    This class serves as a wrapper around the core analytical engine (Design class),
    handling data preparation, research design interpretation, and result presentation.

    Parameters:
        data (pd.DataFrame): Dataset containing facet variables and response measurements.
        design_str (str): String specifying the research design in standard notation
                        (e.g., "p x i" for persons crossed with items). Only parsed when
                        ``variance_tuple_dictionary`` is not supplied.
        response (str): Column name in data containing the response/measurement values.
        variance_tuple_dictionary (dict, optional): Mapping of string keys to tuples.
            When given, it is validated and used as-is instead of being derived
            from ``design_str``; the facets are then extracted from it.

    Attributes:
        design (Design): The underlying Design object that performs calculations.

    Raises:
        ValueError: If the research design is invalid, or if a supplied
            ``variance_tuple_dictionary`` is not a dict of str keys and tuple values.

    Example:
        >>> import pandas as pd
        >>> from generalizit import GeneralizIT
        >>> data = pd.read_csv("my_data.csv")
        >>> gt = GeneralizIT(data, "p x i", "score")
        >>> gt.calculate_anova()
        >>> gt.calculate_g_coefficients()
        >>> gt.g_coefficients_summary()
    """

    _ANOVA_REQUIRED_MSG = "ANOVA table must be calculated first. Please run the calculate_anova() method."

    def __init__(self, data: pd.DataFrame, design_str: str, response: str, variance_tuple_dictionary: Optional[Dict[str, Tuple[str, ...]]] = None):
        if variance_tuple_dictionary is None:
            # Derive the variance components from the design string.
            design_num, facets = match_research_design(design_str)

            # Let a ValueError from validation propagate unchanged; re-wrapping it
            # would only discard the original traceback.
            validate_research_design(design_num)

            variance_tuple_dictionary = parse_facets(design_num=design_num, design_facets=facets)
        else:
            # Validate the user-supplied dictionary before trusting it.
            if not isinstance(variance_tuple_dictionary, dict):
                raise ValueError("variance_tuple_dictionary must be a dictionary.")
            if not all(isinstance(k, str) and isinstance(v, tuple) for k, v in variance_tuple_dictionary.items()):
                raise ValueError("variance_tuple_dictionary must contain string keys and tuple values.")
            facets = get_facets_from_variance_tuple_dictionary(variance_tuple_dictionary)

        data, missing_data = self._clean_data(data=data, facets=facets, response=response)

        # Hand the prepared data over to the analytical engine.
        self.design = Design(
            data=data,
            variance_tuple_dictionary=variance_tuple_dictionary,
            missing_data=missing_data,
            response_col=response
        )

    def _clean_data(self, data: pd.DataFrame, facets: list[str], response: str) -> Tuple[pd.DataFrame, bool]:
        """
        Normalize column names and prune the DataFrame to the facet and response columns.

        Every column except the response column has its name stripped, lower-cased
        and its internal whitespace runs collapsed to a single space. Non-string
        column labels are left untouched. Columns that are neither a facet nor the
        response are then dropped, and a message listing them is printed.

        Args:
            data (pd.DataFrame): The input DataFrame to be pruned (not modified in place).
            facets (list[str]): The list of facet column names.
            response (str): The response variable column name.

        Returns:
            pd.DataFrame: The pruned DataFrame containing only the columns specified in facets and the response variable.
            bool: A boolean indicating whether there are missing values in the data.
        """
        # Build the full rename mapping once instead of re-creating the frame per column.
        # Only string labels can be normalized; others (e.g. integer labels) pass through.
        normalized_names = {
            col: re.sub(r"\s+", " ", col.strip().lower())
            for col in data.columns
            if isinstance(col, str) and col != response
        }
        data = data.rename(columns=normalized_names)

        # Everything that is not a facet or the response gets dropped.
        variables = list(facets) + [response]
        drop_cols = [col for col in data.columns if col not in variables]

        if drop_cols:
            print("Warning: The following columns have been dropped from the data:")
            for col in drop_cols:
                print(col)

        data = data.drop(columns=drop_cols)

        # Flag missing values so the Design object is told about them.
        missing_data = bool(data.isnull().values.any())
        if missing_data:
            print("Warning: Missing values detected in the data.")

        return data, missing_data

    def _require_anova(self) -> None:
        """Raise RuntimeError unless the Design object holds a non-empty ANOVA table."""
        table = self.design.anova_table
        # Accept both "never set" (None) and "set but empty" as not-yet-calculated.
        if table is None or table.empty:
            raise RuntimeError(self._ANOVA_REQUIRED_MSG)

    def _validate_fixed_facets(self, fixed_facets: Optional[List[str]]) -> None:
        """
        Validate a ``fixed_facets`` argument against the design's variance components.

        Does nothing when ``fixed_facets`` is None. Otherwise prints a reminder that
        fixed facets are meant for balanced designs without missing data.

        Raises:
            ValueError: If ``fixed_facets`` is not a list, has more entries than the
                longest variance tuple's length minus 2, or names a facet that
                does not appear in any variance tuple.
        """
        if fixed_facets is None:
            return
        if not isinstance(fixed_facets, list):
            raise ValueError("fixed_facets must be a list.")

        component_tuples = self.design.variance_tuple_dictionary.values()
        max_tuple = max(component_tuples, key=len)
        if len(fixed_facets) > len(max_tuple) - 2:
            raise ValueError("The number of fixed facets cannot exceed the number of facets in the design minus 2.")

        all_facets = {f for t in component_tuples for f in t}
        for facet in fixed_facets:
            if facet not in all_facets:
                raise ValueError(f"Fixed facet '{facet}' not found in the design facets: {all_facets}")
        print("Warning: Fixed facets should only be used with balanced designs without missing data.")

    def calculate_anova(self):
        """
        Calculate variance components using ANOVA.

        This method is a wrapper for the Design.calculate_anova() method, which
        implements Henderson's Method 1 to estimate variance components for each facet
        in the specified research design.

        Returns:
            None: Results are stored in the underlying Design object.

        Notes:
            This method must be called before calculating G-coefficients,
            confidence intervals, or D-study scenarios.
        """
        self.design.calculate_anova()

    def calculate_g_coefficients(self, fixed_facets: Optional[List[str]] = None, **kwargs):
        """
        Calculate generalizability coefficients.

        This method is a wrapper for the Design.g_coeffs() method, which computes
        phi-squared (Φ²) and rho-squared (ρ²) coefficients for each potential
        object of measurement in the design.

        Parameters:
            - fixed_facets Optional(List[str]): List of facets to be treated as fixed.
            **kwargs: Optional keyword arguments passed to Design.g_coeffs().
                - error_variance (bool): If True, prints detailed information about
                  error variances during calculation. Default is False.
                - Other parameters as documented in Design.g_coeffs().

        Returns:
            None: Results are stored in the underlying Design object.

        Raises:
            RuntimeError: If ANOVA table hasn't been calculated first.
            ValueError: If ``fixed_facets`` is invalid for this design.
        """
        self._require_anova()
        self._validate_fixed_facets(fixed_facets)
        self.design.g_coeffs(fixed_facets=fixed_facets, **kwargs)

    def g_coeffs(self, **kwargs):
        """
        [DEPRECATED] Calculate generalizability coefficients.

        This method is deprecated and will be removed in a future version.
        Please use `calculate_g_coefficients()` instead.

        See `calculate_g_coefficients()` for parameter details.
        """
        warnings.warn(
            "The g_coeffs() method is deprecated and will be removed in a future version. "
            "Please use calculate_g_coefficients() instead.",
            DeprecationWarning,
            stacklevel=2
        )
        return self.calculate_g_coefficients(**kwargs)

    def calculate_d_study(self, d_study_design: Optional[dict] = None, fixed_facets: Optional[List[str]] = None, **kwargs):
        """
        Calculate G-coefficients for alternative research designs (D-Study).

        This method is a wrapper for the Design.calculate_d_study() method, which examines
        multiple possible study designs by generating combinations of provided facet levels.

        Parameters:
            - d_study_design (dict, optional): Dictionary where keys are facet names and values are
                lists of integers representing different numbers of levels to test.
            - fixed_facets Optional(List[str]): List of facets to be treated as fixed.
            - **kwargs: Optional keyword arguments passed to Design.calculate_d_study().

        Returns:
            None: Results are stored in the underlying Design object.

        Raises:
            RuntimeError: If ANOVA table hasn't been calculated first.
            ValueError: If ``fixed_facets`` is invalid for this design.
        """
        self._require_anova()
        self._validate_fixed_facets(fixed_facets)
        self.design.calculate_d_study(d_study_design=d_study_design, fixed_facets=fixed_facets, **kwargs)

    def calculate_confidence_intervals(self, alpha: float = 0.05, **kwargs):
        """
        Calculate confidence intervals for facet level means.

        This method is a wrapper for the Design.calculate_confidence_intervals() method,
        which computes confidence intervals for individual facets based on variance
        component analysis.

        Parameters:
            alpha (float, optional): Significance level for confidence intervals.
                Default is 0.05 (producing 95% confidence intervals).
            **kwargs: Optional keyword arguments passed to Design.calculate_confidence_intervals().

        Returns:
            None: Results are stored in the underlying Design object.

        Raises:
            RuntimeError: If ANOVA table hasn't been calculated first.
        """
        self._require_anova()
        self.design.calculate_confidence_intervals(alpha=alpha, **kwargs)

    # ----------------- Summary Methods -----------------

    def anova_summary(self):
        """
        Print the ANOVA table via Design.anova_summary().

        Raises:
            RuntimeError: If ANOVA table hasn't been calculated first.
        """
        self._require_anova()
        self.design.anova_summary()

    def variance_summary(self):
        """
        Print the variance components via Design.variance_summary().

        Raises:
            RuntimeError: If ANOVA table hasn't been calculated first.
        """
        self._require_anova()
        self.design.variance_summary()

    def g_coefficients_summary(self):
        """
        Print the G coefficients via Design.g_coeff_summary().

        Raises:
            RuntimeError: If the G coefficients haven't been calculated first.
        """
        table = self.design.g_coeffs_table
        if table is None or table.empty:
            raise RuntimeError("G coefficients must be calculated first. Please run the calculate_g_coefficients() method.")
        self.design.g_coeff_summary()

    def d_study_summary(self):
        """
        Print the D-study results via Design.d_study_summary().

        Raises:
            RuntimeError: If the D study hasn't been calculated first.
        """
        if not self.design.d_study_dict:
            raise RuntimeError("D study must be calculated first. Please run the calculate_d_study() method.")
        self.design.d_study_summary()

    def confidence_intervals_summary(self):
        """
        Print the confidence intervals via Design.confidence_intervals_summary().

        Raises:
            RuntimeError: If the confidence intervals haven't been calculated first.
        """
        intervals = self.design.confidence_intervals
        # An emptiness test is required here: comparing with `is {}` is an identity
        # check against a brand-new dict and can never be true.
        if intervals is None or len(intervals) == 0:
            raise RuntimeError("Confidence intervals must be calculated first. Please run the calculate_confidence_intervals() method.")
        self.design.confidence_intervals_summary()

# ---- End of GeneralizIT Class ----

# # ---- Wrapper Documentation ----
# GeneralizIT.calculate_anova.__doc__ = (
#     GeneralizIT.calculate_anova.__doc__ + 
#     "\n\n" + 
#     Design.anova_summary.__doc__
# )
# GeneralizIT.calculate_g_coefficients.__doc__ = (
#     GeneralizIT.calculate_g_coefficients.__doc__ + 
#     "\n\n" + 
#     Design.g_coeffs.__doc__
# )
# GeneralizIT.calculate_d_study.__doc__ = (
#     GeneralizIT.calculate_d_study.__doc__ + 
#     "\n\n" + 
#     Design.calculate_d_study.__doc__
# )
# GeneralizIT.calculate_confidence_intervals.__doc__ = (
#     GeneralizIT.calculate_confidence_intervals.__doc__ + 
#     "\n\n" + 
#     Design.calculate_confidence_intervals.__doc__
# )
# GeneralizIT.anova_summary.__doc__ = Design.anova_summary.__doc__
# GeneralizIT.variance_summary.__doc__ = Design.variance_summary.__doc__
# GeneralizIT.g_coefficients_summary.__doc__ = Design.g_coeff_summary.__doc__
# GeneralizIT.d_study_summary.__doc__ = Design.d_study_summary.__doc__
# GeneralizIT.confidence_intervals_summary.__doc__ = Design.confidence_intervals_summary.__doc__
# # ---- End of Wrapper Documentation ----