Source code for DiadFit.densimeter_fitting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
import lmfit
from lmfit.models import GaussianModel, VoigtModel, LinearModel, ConstantModel
from scipy.signal import find_peaks
import os
import re
from os import listdir
from os.path import isfile, join
import pickle
from scipy import stats


encode="ISO-8859-1"

## Files are fit using the folder UCB_cali_fitting_1117_1447

[docs] def calculate_generic_std_err_values(*, pickle_str, new_x, CI=0.67): """ This function loads a model from a pickle file and calculates standard error values based on the model residuals, considering a confidence interval. Parameters ----------------- pickle_str (str) The path to the pickle file containing the model. new_x (np.array): x values for calculation. CI (float, optional): The confidence level for prediction intervals (default: 0.67). Returns: - pandas.DataFrame """ # Load the model and the data from the pickle file with open(pickle_str, 'rb') as f: data = pickle.load(f) model = data['model'] N_poly = model.order - 1 Pf = data['model'] x = data['x'] y = data['y'] # Calculate the residuals residuals = y - Pf(x) # Calculate the standard deviation of the residuals residual_std = np.std(residuals) # Calculate the standard errors for the new x values mean_x = np.mean(x) n = len(x) standard_errors = residual_std * np.sqrt(1 + 1/n + (new_x - mean_x)**2 / np.sum((x - mean_x)**2)) # Calculate the degrees of freedom df = len(x) - (N_poly + 1) # Calculate the t value for the given confidence level t_value = stats.t.ppf((1 + CI) / 2, df) # Calculate the prediction intervals preferred_values = Pf(new_x) lower_values = preferred_values - t_value * standard_errors upper_values = preferred_values + t_value * standard_errors df=pd.DataFrame(data={ 'time': new_x, 'preferred_values': preferred_values, 'lower_values': lower_values, 'upper_values': upper_values }) return df
[docs] def plot_and_save_CO2cali_pickle(*, cali_data, CO2_dens_col='rho',Split_col='Split', split_error='split_err',CO2_dens_error='dens_err', density_range, N_poly=3, CI=0.67, std_error=True, save_fig=False,eq_division='ccmr',save_suffix=''): """ This function calculates and saves three polynomial regression models as pickles for CO2 calibration data according to DeVitre et al. 2021 (low density, mid density and high density). The models are saved as pickles and the plot is saved as well if desired. Parameters ----------------- cali_data (pandas.DataFrame): A DataFrame containing calibration data (should at least contain a density and a fermi splitting column) CO2_dens_col (str, optional): The column name corresponding to CO2 density in the cali_data DataFrame (default: 'rho'). Split_col (str, optional): The column name corresponding to Fermi splitting values in the cali_data DataFrame (default: 'Split'). split_error (str, float, or array-like, optional): The column name corresponding to error in fermi splitting OR an array of splitting errors OR a float (default: 'split_err'). CO2_dens_error (str, float or array-like, optional): The column name corresponding to error in CO2 density OR an array of density errors OR a float(default: 'dens_err'). density_range (str): The density range to be fit ('Low', 'Medium', or 'High'). N_poly (int, optional): The degree of the polynomial fit (default: 3). CI (float, optional): The confidence level for prediction intervals (default: 0.67). std_error (bool, optional): Whether to calculate and plot standard error (default: True). save_fig (bool, optional): Whether to save the plot as an image (default: False). eq_division (str, optional): Method for dividing the data based on density ('ccmr', 'cmass', or 'cmass_24C', default: 'ccmr'). CCMR corresponds to the limits for each section as shown in DeVitre et al., 2021 (Chem. Geo), cmass is for those in DeVitre et al., 2023 (J. Volcanica). save_suffix (str, optional): Suffix to be added to the saved file names (default: ''). Returns: Saves a .pkl file, doesnt return anything. """ # Define the x and y values try: if eq_division=='ccmr': lowcut=0.17 midcut_low=0.12 midcut_high=0.72 highcut=0.65 elif eq_division=='cmass': lowcut=0.20 midcut_low=0.13 midcut_high=0.70 highcut=0.65 elif eq_division=='cmass_24C': lowcut=0.24 midcut_low=0.24 midcut_high=0.65 highcut=0.65 if density_range == 'Low': cali_data = cali_data[cali_data[CO2_dens_col] < lowcut] prefix='Lowrho_' elif density_range == 'Medium': cali_data = cali_data[cali_data[CO2_dens_col].between(midcut_low, midcut_high)] prefix='Mediumrho_' elif density_range == 'High': cali_data = cali_data[cali_data[CO2_dens_col] > highcut] prefix='Highrho_' else: raise ValueError("Invalid density range. Please choose 'Low', 'Medium', or 'High'.") except ValueError as e: print(f"Warning: {e}") return x_all = cali_data[Split_col].values y_all = cali_data[CO2_dens_col].values if not isinstance(split_error,str)==True: x_err=pd.Series(split_error, index=cali_data[CO2_dens_col].index).values else: x_err=cali_data[split_error].values if not isinstance(CO2_dens_error,str)==True: y_err=pd.Series(CO2_dens_error, index=cali_data[CO2_dens_col].index).values else: y_err=cali_data[CO2_dens_error].values non_nan_indices = ~np.isnan(x_all) & ~np.isnan(y_all) # Filter out NaN values x = x_all[non_nan_indices] y = y_all[non_nan_indices] # Perform polynomial regression coefficients = np.polyfit(x, y, N_poly) Pf = np.poly1d(coefficients) # Save the model and the data to a pickle file data = {'model': Pf, 'x': x, 'y': y} with open(prefix+'polyfit_data'+save_suffix+'.pkl', 'wb') as f: pickle.dump(data, f) if std_error is True: new_x_plot = np.linspace(np.min(x), np.max(x), 100) new_calidf = calculate_generic_std_err_values(pickle_str=prefix+'polyfit_data'+save_suffix+'.pkl', new_x=pd.Series(new_x_plot), CI=CI) # Calculate R-squared and p-value residuals = y - Pf(x) ssr = np.sum(residuals**2) sst = np.sum((y - np.mean(y))**2) r_squared = 1 - (ssr / sst) df = len(x) - (N_poly + 1) p_value = 1 - stats.f.cdf(r_squared * (len(x) - 1), N_poly, df) legend_label = f'best fit: R$^2$ = {r_squared:.4f}, p-val = {p_value:.1e}' else: legend_label = 'best fit' # Now lets plot the confidence interval fig, (ax1) = plt.subplots(1, 1, figsize=(10,5)) ax1.errorbar(x, y, xerr=x_err,yerr=y_err, fmt='o', ecolor='grey', elinewidth=0.8, mfc='grey', ms=5, mec='k',capsize=3,barsabove=True) ax1.plot(new_x_plot, new_calidf['preferred_values'], '-k', label=legend_label) # ax1.plot(new_x_plot, new_calidf['lower_values'], ':k', label='lower vals') # ax1.plot(new_x_plot, new_calidf['upper_values'], ':k', label='upper vals') ax1.fill_between(new_x_plot, new_calidf['lower_values'], new_calidf['upper_values'], color='gray', alpha=0.2,zorder=-2,label='Prediction interval') ax1.set_xlabel('Split') ax1.set_ylabel('Density') ax1.legend() ax1.ticklabel_format(useOffset=False) # this sets the ordinal suffix for the polynomial degree in the title if N_poly == 1: suffix = 'st' elif N_poly == 2: suffix = 'nd' elif N_poly == 3: suffix = 'rd' else: suffix='th' ax1.set_title(f"{N_poly}$^{{{suffix}}}$ degree polynomial: "+str(100*CI) + ' % prediction interval') if save_fig is True: fig.savefig(prefix+'cali_line'+save_suffix+'.png')