Source code for generalizit.design_utils

import re
from typing import Optional, Dict, List, Tuple, Union
from itertools import combinations
import pandas as pd


[docs] def match_research_design(input_string: str) -> Union[Tuple[str, List], Tuple[int, List], Tuple[None, List]]: """ Matches a string input of a research design to one of 8 predefined designs. Parameters: input_string (str): The research design input as a string. Valid operators are 'x' for crossing and ':' for nesting. Some designs require parentheses to indicate grouping. Returns: int or str: The design number (1 to 8) that matches the input List: List of facets extracted from the design string Raises: ValueError: If the input string is invalid or malformed TypeError: If the input is not a string Examples: >>> match_research_design("persons x raters") # Design 1 1 >>> match_research_design("items:persons") # Design 2 2 >>> match_research_design("p x i x h") # Design 3 3 >>> match_research_design("p x (i:h)") # Design 4 4 """ # Type checking if not isinstance(input_string, str): raise TypeError("Input must be a string") # Check for empty or whitespace-only input if not input_string.strip(): raise ValueError("Input string cannot be empty") # Define the patterns for each design - ordered from most specific to least specific # The number corresponds to the table in Brennan (2001) designs = { 7: r"^\(.+\s*x\s+.+\)\s*:\s*.+$", # (i x h):p 4: r"^.+\s*x\s*\(.+\s*:\s*.+\)$", # p x (i:h) 5: r"^\(.+\s*:\s*.+\)\s*x\s+.+$", # (i:p) x h 6: r"^.+\s*:\s*\(.+\s*x\s+.+\)$", # i:(p x h) 8: r"^.+\s*:\s*.+\s*:\s*.+$", # i:h:p 2: r"^.+\s*:\s*.+$", # i:p } # Validate characters before normalization valid_chars = set("abcdefghijklmnopqrstuvwxyz0123456789():x_- ") if not all(c.lower() in valid_chars for c in input_string if c not in "\n\t"): raise ValueError("Invalid characters detected. Only letters, numbers, ':', 'x', '(', ')', '_', '-' and spaces are allowed.") # Preprocess the input string to normalize spacing and casing try: normalized_input = re.sub(r"\s+", " ", input_string.strip().lower()) if len(normalized_input) > 100: # Reasonable length limit raise ValueError("Input string is too long") except Exception as e: raise ValueError(f"Error normalizing input string: {str(e)}") # Validate parentheses matching if normalized_input.count("(") != normalized_input.count(")"): raise ValueError("Mismatched parentheses") # Validate operators if ":" in normalized_input and " x " in normalized_input: if "(" not in normalized_input or ")" not in normalized_input: raise ValueError("Invalid input: Parentheses are required for partially crossed designs. See examples.") # Split on operators while preserving the operators def get_facets(input_str: str) -> list: """ Splits input string into facets, properly handling 'x' operator vs 'x' in words. """ # First split on colons colon_parts = input_str.split(':') all_facets = [] for part in colon_parts: # For each part, split on ' x ' (space-x-space) to avoid splitting words containing 'x' x_parts = part.split(' x ') all_facets.extend(x_parts) # Remove parentheses and strip whitespace facets = [term.replace("(", "").replace(")", "").strip() for term in all_facets] return [facet for facet in facets if facet] # Remove empty facets # Count and validate operators colon_count = normalized_input.count(":") cross_count = normalized_input.count(" x ") if colon_count > 2: raise ValueError("Invalid input: More than 2 ':' operators detected.") elif colon_count + cross_count > 2: raise ValueError("Invalid input: More than 2 operators detected.") elif colon_count + cross_count == 0: raise ValueError("Invalid input: No operators detected. Must use ':' or 'x'.") # Check for a fully crossed design elif colon_count == 0 and cross_count > 0: print(f"Fully Crossed Design: {normalized_input}") return 'crossed', get_facets(normalized_input) # Validate no empty facets between operators facets = get_facets(normalized_input) if any(not term.strip() for term in facets): raise ValueError("Invalid input: Empty facets between operators") # Check each design pattern try: for design_number, pattern in designs.items(): if re.match(pattern, normalized_input): return design_number, facets except re.error as e: raise ValueError(f"Error in pattern matching: {str(e)}") # If no match but input seems valid, return None return None, facets
[docs] def validate_research_design(design_number: Optional[int]) -> bool: """ Validates if a design number is valid (1-8). Parameters: design_number: The design number to validate Returns: bool: True if valid, False otherwise """ return isinstance(design_number, int) and 1 <= design_number <= 8 or design_number == 'crossed'
[docs] def create_corollary_dictionary(design_num: int, design_facets: list) -> Dict[str, str]: """ Parse a research design string and return a dictionary mapping facet types (p, i, h) to their actual values based on the design pattern. Parameters: design_num (int): The design number design_facets (list): List of facets extracted from the design string Returns: Dict[str, str]: Dictionary mapping facet names from the research design to base design facets. Keys are 'p', 'i', and/or 'h' Raises: ValueError: If the design pattern is invalid or can't be parsed """ # Initialize facets dictionary corollary_dict = {} try: if design_num == 'crossed': return {f"facet_{i+1}": facet for i, facet in enumerate(design_facets)} elif design_num == 2: # i:p corollary_dict['i'] = design_facets[0] corollary_dict['p'] = design_facets[1] elif design_num == 8: # i:h:p corollary_dict['i'] = design_facets[0] corollary_dict['h'] = design_facets[1] corollary_dict['p'] = design_facets[2] elif design_num == 6: # i:(p x h) corollary_dict['i'] = design_facets[0] corollary_dict['p'] = design_facets[1] corollary_dict['h'] = design_facets[2] elif design_num == 5: # (i:p) x h corollary_dict['i'] = design_facets[0] corollary_dict['p'] = design_facets[1] corollary_dict['h'] = design_facets[2] elif design_num == 4: # p x (i:h) corollary_dict['p'] = design_facets[0] corollary_dict['i'] = design_facets[1] corollary_dict['h'] = design_facets[2] elif design_num == 7: # (i x h):p corollary_dict['i'] = design_facets[0] corollary_dict['h'] = design_facets[1] corollary_dict['p'] = design_facets[2] # Verify all required facets are present required_facets = ['i', 'p'] if design_num >= 3: # Designs 3-8 require all three facets required_facets.append('h') missing_facets = [f for f in required_facets if f not in corollary_dict] if missing_facets: raise ValueError(f"Missing required facets: {missing_facets}") except Exception as e: raise ValueError(f"Error parsing facets: {str(e)}") return corollary_dict
[docs] def create_variance_tuple_dictionary( design_num: Union[int, str], corollary_dict: Dict[str, str] ) -> Dict[str, tuple]: """ Create a variance tuple dictionary for a given study design structure. This function constructs a dictionary representing variance components for a given study design, for example Brennan Design 2 `(i:p)`, where `p` and `i` represent facets of variation. The dictionary maps each variance component to a tuple of relevant facets. Parameters: design_num (int): The design number (2,4-8) or 'crossed' for fully crossed designs. corollary_dict (dict): A dictionary mapping corollary names to actual facet names. For example, {'p': 'persons', 'i': 'items'}. ``` Returns: dict: A dictionary where keys are variance component names (strings), and values are tuples of the corresponding facets. Example: >>> create_variance_tuple_dictionary(2, {'p': 'persons', 'i': 'items'}) { 'p': ('p',), 'i:p': ('i', 'p'), 'mean': () } """ variance_tuple_dictionary = {} try: if design_num == 'crossed': # each facet should be a separate variance component and crossed # get all possible combinations of facets for j in range(1, len(corollary_dict) + 1): for combo in combinations(corollary_dict.values(), j): variance_tuple_dictionary[' x '.join(combo)] = combo elif design_num == 2: # i:p variance_tuple_dictionary[corollary_dict['p']] = (corollary_dict['p'],) variance_tuple_dictionary[f'{corollary_dict["i"]}:{corollary_dict["p"]}'] = (corollary_dict['i'], corollary_dict['p']) elif design_num == 8: # i:h:p variance_tuple_dictionary[corollary_dict['p']] = (corollary_dict['p'],) variance_tuple_dictionary[f'{corollary_dict["h"]}:{corollary_dict["p"]}'] = (corollary_dict['h'], corollary_dict['p']) variance_tuple_dictionary[f'{corollary_dict["i"]}:{corollary_dict["h"]}:{corollary_dict["p"]}'] = (corollary_dict['i'], corollary_dict['h'], corollary_dict['p']) elif design_num == 6: # i:(p x h) variance_tuple_dictionary[corollary_dict['p']] = (corollary_dict['p'],) variance_tuple_dictionary[corollary_dict['h']] = (corollary_dict['h'],) variance_tuple_dictionary[f'{corollary_dict["p"]} x {corollary_dict["h"]}'] = (corollary_dict['p'], corollary_dict['h']) variance_tuple_dictionary[f'{corollary_dict["i"]}:({corollary_dict["p"]} x {corollary_dict["h"]})'] = (corollary_dict['i'], corollary_dict['p'], corollary_dict['h']) elif design_num == 5: # (i:p) x h variance_tuple_dictionary[corollary_dict['p']] = (corollary_dict['p'],) variance_tuple_dictionary[corollary_dict['h']] = (corollary_dict['h'],) variance_tuple_dictionary[f'{corollary_dict["i"]}:{corollary_dict["p"]}'] = (corollary_dict['i'], corollary_dict['p']) variance_tuple_dictionary[f'{corollary_dict["p"]} x {corollary_dict["h"]}'] = (corollary_dict['p'], corollary_dict['h']) variance_tuple_dictionary[f'({corollary_dict["i"]}:{corollary_dict["p"]}) x {corollary_dict["h"]}'] = (corollary_dict['i'], corollary_dict['p'], corollary_dict['h']) elif design_num == 4: # p x (i:h) variance_tuple_dictionary[corollary_dict['p']] = (corollary_dict['p'],) variance_tuple_dictionary[corollary_dict['h']] = (corollary_dict['h'],) variance_tuple_dictionary[f'{corollary_dict["i"]}:{corollary_dict["h"]}'] = (corollary_dict['i'], corollary_dict['h']) variance_tuple_dictionary[f'{corollary_dict["p"]} x {corollary_dict["h"]}'] = (corollary_dict['p'], corollary_dict['h']) variance_tuple_dictionary[f'{corollary_dict["p"]} x ({corollary_dict["i"]}:{corollary_dict["h"]})'] = (corollary_dict['p'], corollary_dict['i'], corollary_dict['h']) elif design_num == 7: # (i x h):p variance_tuple_dictionary[corollary_dict['p']] = (corollary_dict['p'],) variance_tuple_dictionary[f'{corollary_dict["i"]}:{corollary_dict["p"]}'] = (corollary_dict['i'], corollary_dict['p']) variance_tuple_dictionary[f'{corollary_dict["h"]}:{corollary_dict["p"]}'] = (corollary_dict['h'], corollary_dict['p']) variance_tuple_dictionary[f'({corollary_dict["i"]} x {corollary_dict["h"]}):{corollary_dict["p"]}'] = (corollary_dict['i'], corollary_dict['h'], corollary_dict['p']) # Add the mean variance component variance_tuple_dictionary['mean'] = () except Exception as e: raise ValueError(f"Error creating variance tuple dictionary: {str(e)}") return variance_tuple_dictionary
[docs] def parse_facets(design_num: Union[int, str], design_facets: list) -> Dict[str, tuple]: """ Parses the facets of a research design and returns a dictionary of variance components. This function combines the functionality of `create_corollary_dictionary` and `create_variance_tuple_dictionary` to generate a comprehensive dictionary of variance components for a given research design. Parameters: design_num (Union[int, str]): The design number or 'crossed' for fully crossed designs. design_facets (list): List of facets extracted from the design string. Returns: Dict[str, tuple]: A dictionary where keys are variance component names (strings), and values are tuples of the corresponding facets. Raises: ValueError: If the design pattern is invalid or can't be parsed. Example: >>> parse_facets(2, ['items', 'persons']) { 'p': ('persons',), 'i:p': ('items', 'persons'), 'mean': () } """ corollary_dict = create_corollary_dictionary(design_num=design_num, design_facets=design_facets) variance_tuple_dict = create_variance_tuple_dictionary(design_num=design_num, corollary_dict=corollary_dict) return variance_tuple_dict
[docs] def get_facets_from_variance_tuple_dictionary( variance_tuple_dict: Dict[str, tuple] ) -> List[str]: """ Extracts the facets from a variance tuple dictionary. Parameters: variance_tuple_dict (Dict[str, tuple]): A dictionary where keys are variance component names (strings), and values are tuples of the corresponding facets. Returns: List[str]: A list of unique facets extracted from the variance tuple dictionary. """ facets = set() for key, value in variance_tuple_dict.items(): if key != 'mean': facets.update(value) return list(facets)