Source code for neer_match_utilities.prepare

import re
import numpy as np
import pandas as pd
from collections import OrderedDict
from .base import SuperClass

class Prepare(SuperClass):
    """
    Clean and align two DataFrames according to a similarity map.

    The class inherits from ``SuperClass`` and adds :meth:`format`, which
    selects the columns named in ``similarity_map`` from ``df_left`` and
    ``df_right``, optionally converts / fills / upper-cases them, and makes
    sure every matched pair of columns ends up with a compatible dtype.

    Attributes
    ----------
    similarity_map : dict
        Column mappings between the left and right DataFrames. A key is
        either a single column name (same name on both sides) or
        ``"left~right"``.
    df_left : pandas.DataFrame
        The left DataFrame to be processed.
    df_right : pandas.DataFrame
        The right DataFrame to be processed.
    id_left : str
        Column holding the unique IDs in the left DataFrame.
    id_right : str
        Column holding the unique IDs in the right DataFrame.

    Notes
    -----
    The attributes above are read by :meth:`format`; they are presumably
    set by ``SuperClass`` (not visible here) -- verify against that class.
    """

    def format(
        self,
        fill_numeric_na: bool = False,
        to_numeric: list = None,
        fill_string_na: bool = False,
        capitalize: bool = False,
    ):
        """
        Clean, process, and align the columns of ``df_left`` and ``df_right``.

        Parameters
        ----------
        fill_numeric_na : bool, optional
            If True, missing values in ``to_numeric`` columns are replaced
            by ``0`` before the numeric conversion. Default is False.
        to_numeric : list, optional
            Column names to convert to a numeric dtype. ``None`` (the
            default) is treated as an empty list.
        fill_string_na : bool, optional
            If True, missing values in the remaining columns are replaced by
            empty strings and the column is cast to ``str``. Default is
            False.
        capitalize : bool, optional
            If True, string values in the columns not listed in
            ``to_numeric`` are upper-cased. Default is False.

        Returns
        -------
        tuple[pandas.DataFrame, pandas.DataFrame]
            The processed left and right DataFrames, each reduced to its ID
            column plus the columns referenced by ``similarity_map``.

        Notes
        -----
        - All whitespace is removed from the column names taken from
          ``similarity_map`` and ``to_numeric`` before they are looked up,
          so a key written as ``"a ~ b"`` refers to columns ``a`` and ``b``.
          The DataFrames themselves are not renamed.
        - Text values in ``to_numeric`` columns keep only digits and ``.``
          before parsing; sign and exponent characters in *text* are
          therefore dropped. Columns that already have a numeric dtype are
          passed through unchanged so their sign and magnitude survive.
        - For every matched pair whose dtypes differ: integer/float pairs
          are aligned to float; if at least one side is non-numeric, both
          sides are converted to strings while missing values are kept.
        """

        def strip_ws(name):
            # Column names are matched with every whitespace character
            # removed (same pattern the column selection has always used).
            return re.sub(r'\s', '', name)

        def split_key(key):
            # "left~right" -> (left, right); a key without '~' names the
            # same column on both sides. Parts beyond the second are ignored.
            parts = key.split('~')
            left = strip_ws(parts[0])
            right = strip_ws(parts[1]) if len(parts) > 1 else left
            return left, right

        # Normalise once so membership tests agree with the column lookups.
        numeric_columns = {strip_ws(name) for name in (to_numeric or [])}

        def process_df(df, columns, id_column):
            """
            Return a cleaned copy of ``df`` restricted to the ID column and
            ``columns`` (already whitespace-normalised names).
            """
            df = df[[id_column] + columns].copy()

            for col in columns:
                if col in numeric_columns:
                    series = df[col]
                    already_numeric = (
                        pd.api.types.is_numeric_dtype(series)
                        and not pd.api.types.is_bool_dtype(series)
                    )
                    if already_numeric:
                        # Round-tripping real numbers through str and the
                        # digit filter would drop minus signs and mangle
                        # scientific notation, so leave them as they are.
                        if fill_numeric_na:
                            series = series.fillna(0)
                    else:
                        # Keep digits and dots only; missing values become
                        # the text 'nan' and are reduced to '' here.
                        series = series.astype(str).str.replace(
                            r'[^\d\.]', '', regex=True
                        )
                        if fill_numeric_na:
                            # Exact match on the empty string (no regex).
                            series = series.replace('', '0')
                    # Anything still unparseable becomes NaN.
                    df[col] = pd.to_numeric(series, errors='coerce')
                elif fill_string_na:
                    df[col] = df[col].fillna('').astype(str)
                else:
                    # Values are kept as-is; missing entries are set to NaN.
                    df[col] = df[col].fillna(np.nan)

            if capitalize:
                for col in columns:
                    if col not in numeric_columns:
                        # Element-wise so that non-string cells (numbers,
                        # NaN) are preserved instead of raising or being
                        # turned into NaN by the .str accessor.
                        df[col] = df[col].map(
                            lambda v: v.upper() if isinstance(v, str) else v
                        )

            return df

        # Resolve every key to its (left, right) column pair, then build the
        # per-side column lists without duplicates, preserving order.
        pairs = [split_key(key) for key in self.similarity_map]
        columns_left = list(OrderedDict.fromkeys(left for left, _ in pairs))
        columns_right = list(OrderedDict.fromkeys(right for _, right in pairs))

        df_left_processed = process_df(self.df_left, columns_left, self.id_left)
        df_right_processed = process_df(self.df_right, columns_right, self.id_right)

        # Ensure matched columns have compatible dtypes.
        for cl, cr in pairs:
            left_col = df_left_processed[cl]
            right_col = df_right_processed[cr]
            if left_col.dtype == right_col.dtype:
                continue

            both_numeric = (
                pd.api.types.is_numeric_dtype(left_col)
                and pd.api.types.is_numeric_dtype(right_col)
            )
            if both_numeric:
                # Promote the integer side to float; other numeric
                # combinations are left untouched.
                if (pd.api.types.is_integer_dtype(left_col)
                        and pd.api.types.is_float_dtype(right_col)):
                    df_left_processed[cl] = left_col.astype(float)
                elif (pd.api.types.is_float_dtype(left_col)
                        and pd.api.types.is_integer_dtype(right_col)):
                    df_right_processed[cr] = right_col.astype(float)
            else:
                # Mixed kinds: compare as strings, keeping missing values.
                df_left_processed[cl] = left_col.apply(
                    lambda x: str(x) if pd.notna(x) else x
                )
                df_right_processed[cr] = right_col.apply(
                    lambda x: str(x) if pd.notna(x) else x
                )

        return df_left_processed, df_right_processed
def similarity_map_to_dict(items: list) -> dict:
    """
    Group a list of similarity mappings into a dictionary.

    Parameters
    ----------
    items : list
        Iterable of 3-tuples ``(left, right, similarity)``.

    Returns
    -------
    dict
        Maps each column key to the list of ``similarity`` entries seen for
        it, in input order. The key is the column name itself when ``left``
        and ``right`` are equal, otherwise ``"left~right"``.
    """
    grouped = {}
    for left_name, right_name, sim in items:
        if left_name == right_name:
            map_key = left_name
        else:
            map_key = f"{left_name}~{right_name}"
        # setdefault creates the list on first sight of a key, then appends.
        grouped.setdefault(map_key, []).append(sim)
    return grouped