Attribution 4.0 (CC BY 4.0) https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
In the Zip archive, spectral.npy contains the average spectral data of red ginseng, mycotoxins, and interfering impurities, and label.npy contains the corresponding labels. The spectral data has shape [1200, 510] and the label data has shape [1200, 1]. An example of data usage (the scikit-learn library in Python is used to build the classification model) follows:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# Load spectral data and labels
x = np.load('.../spectral.npy')[:,1:-1]
y = np.load('.../label.npy').ravel()  # flatten the [1200, 1] label array to 1-D for the classifier
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Data standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)
# Predict
y_pred = knn_model.predict(x_test)
# Print classification reports and accuracy rates
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))
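The example above fixes n_neighbors=5. If tuning is desired, a minimal sketch using scikit-learn's GridSearchCV on the standardized training set (the candidate values below are an illustrative assumption) could be:
from sklearn.model_selection import GridSearchCV
# Try a small grid of neighbor counts and pick the best by cross-validated accuracy
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
print("Best n_neighbors:", grid.best_params_['n_neighbors'])
print("Cross-validated accuracy:", grid.best_score_)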
import pandas as pd
import numpy as np
# The dataset is assumed to be already loaded into a DataFrame named `data`, e.g. via pd.read_csv
PERFORMING EDA
data.head()
data.info()
attributes_data = data.iloc[:, 1:]
attributes_data
attributes_data.describe()
attributes_data.corr()
import seaborn as sns
import matplotlib.pyplot as plt
correlation_matrix = attributes_data.corr()
plt.figure(figsize=(18, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
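To read the heatmap more easily, pairs with high absolute correlation can also be listed programmatically; a small sketch (the 0.5 cutoff is an arbitrary assumption):
# Keep only the upper triangle of the correlation matrix and sort pairs by absolute correlation
high_corr = (correlation_matrix.abs()
             .where(np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1))
             .stack()
             .sort_values(ascending=False))
print(high_corr[high_corr > 0.5])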
CHECKING IF DATASET IS LINEAR OR NON-LINEAR
correlations = data.corr()['Diabetes_binary'].drop('Diabetes_binary')
plt.figure(figsize=(10, 6))
correlations.plot(kind='bar')
plt.xlabel('Predictor Columns')
plt.ylabel('Correlation values')
plt.title('Correlation between Diabetes_binary and Predictors')
plt.show()
CHECKING FOR NULL AND MISSING VALUES, CLEANING THEM
print(data.isnull().sum())
print(data.isna().sum())
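If the checks above report any missing values, a minimal cleaning step (assuming row-wise removal is acceptable for this dataset) would be:
# Drop rows containing any missing values and confirm none remain
data = data.dropna()
print("Remaining missing values:", data.isnull().sum().sum())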
LASSO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
parameters = {"alpha": np.linspace(0.00001, 10, 500)}  # 500 evenly spaced candidate alphas; np.arange with a step of 500 would yield only a single value
kfold = KFold(n_splits = 10, shuffle=True, random_state = 42)
lassoReg = Lasso()
lasso_cv = GridSearchCV(lassoReg, param_grid = parameters, cv = kfold)
lasso_cv.fit(X, y)
print("Best Params {}".format(lasso_cv.best_params_))
column_names = list(data)
column_names = column_names[1:]
column_names
lassoModel = Lasso(alpha=0.00001)
lassoModel.fit(X_train, y_train)
lasso_coeff = np.abs(lassoModel.coef_)  # take absolute values so all coefficients are positive
plt.bar(column_names, lasso_coeff, color='orange')
plt.xticks(rotation=90)
plt.grid()
plt.title("Feature Selection Based on Lasso")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.ylim(0, 0.16)
plt.show()
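To turn the coefficient plot into an explicit feature subset, one possible follow-up (the 0.001 cutoff is an assumption) is:
# Keep features whose absolute Lasso coefficient exceeds the chosen cutoff
threshold = 0.001
selected_features = [name for name, coef in zip(column_names, lasso_coeff) if coef > threshold]
print("Selected features:", selected_features)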
RFE
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
rfecv = RFECV(estimator=model, step=1, cv=20, scoring="accuracy")
rfecv = rfecv.fit(X_train, y_train)
cv_scores = rfecv.cv_results_["mean_test_score"]  # mean cross-validated accuracy for each number of features (scikit-learn >= 1.0)
num_features_selected = len(cv_scores)
plt.figure(figsize=(10, 6))
plt.xlabel("Number of features selected")
plt.ylabel("Score (accuracy)")
plt.plot(range(1, num_features_selected + 1), cv_scores, marker='o', color='r')
plt.xticks(range(1, num_features_selected + 1))  # set x-ticks to integers
plt.grid()
plt.title("RFECV: Number of Features vs. Score (accuracy)")
plt.show()
print("The optimal number of features:", rfecv.n_features_) print("Best features:", X_train.columns[rfecv.support_])
PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
X = data.drop(["Diabetes_binary"], axis=1)
y = data["Diabetes_binary"]
df1 = pd.DataFrame(data=X, columns=X.columns)  # use the predictor columns only, so the target does not enter the PCA
print(df1)
scaling = StandardScaler()
scaling.fit(df1)
Scaled_data = scaling.transform(df1)
principal = PCA(n_components=3)
principal.fit(Scaled_data)
x = principal.transform(Scaled_data)
print(x.shape)
principal.components_
plt.scatter(x[:, 0], x[:, 1], c=data['Diabetes_binary'], cmap='plasma')
plt.xlabel('pc1')
plt.ylabel('pc2')
print(principal.explained_variance_ratio_)
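To judge whether three components are sufficient, the cumulative explained variance can also be checked; a small sketch reusing the fitted principal object:
# Cumulative share of variance captured by the retained components
print("Cumulative explained variance:", np.cumsum(principal.explained_variance_ratio_))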
T-SNE
from sklearn.manifold import TSNE
from numpy import reshape
import seaborn as sns
tsne = TSNE(n_components=3, verbose=1, random_state=42)
z = tsne.fit_transform(X)
df = pd.DataFrame()
df["y"] = y
df["comp-1"] = z[:, 0]
df["comp-2"] = z[:, 1]
df["comp-3"] = z[:, 2]
sns.scatterplot(x="comp-1", y="comp-2", hue=df.y.tolist(), palette=sns.color_palette("husl", 2), data=df).set(title="Diabetes data T-SNE projection")
Attribution 4.0 (CC BY 4.0) https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
Notaries, Property Institutions, and the Civil Law Trap: Evidence from Global IPRI Data
This project examines the relationship between national notarial systems and institutional quality across countries using data from the International Property Rights Index (IPRI). We analyze how different legal traditions—particularly civil law systems with Latin Notariat models—affect property rights enforcement, regulatory quality, and corporate governance.
By applying Principal Component Analysis (PCA) to 13 institutional variables from the IPRI dataset, we derive three dimensions of governance:
Property Rights and Enforcement
Political Governance and Rule of Law
Registration and Financing Access
We then use logistic regression to estimate the probability that a country uses a specific notarial system (e.g., Latin Notariat) based on these principal components. The findings underscore how legal traditions shape institutional outcomes and help explain the persistence of economically inefficient practices like monopolistic notary regimes.
This work contributes to law-and-economics scholarship, legal origins theory, and institutional reform debates, with practical implications for international development, business law harmonization, and civil code modernization.
# Step 1: Install & import libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from google.colab import files
# Step 2: Upload CSV
uploaded = files.upload()
# Step 3: Load CSV
df = pd.read_csv('IPRI_with_Notary_Systems.csv')
# Step 4: Rename IPRI columns
rename_dict = {
'Legal and Political Environment (LP)': 'LP',
'Judicial Independence': 'Judicial_Independence',
'Rule of Law': 'Rule_of_Law',
'Political Stability': 'Political_Stability',
'Control of Corruption': 'Control_of_Corruption',
'Physical Property Rights (PPR)': 'PPR',
'Physical Property Protection': 'Physical_Property_Protection',
'Registering Process': 'Registering_Process',
'Access to Financing': 'Access_to_Financing',
'Intellectual Property Rights (IPR)': 'IPR',
'Intellectual Property Protection': 'Intellectual_Property_Protection',
'Patent Protection': 'Patent_Protection',
'Copyright Protection': 'Copyright_Protection',
'Trademark Protection': 'Trademark_Protection'
}
df.rename(columns=rename_dict, inplace=True)
ipri_vars = list(rename_dict.values())
# Step 5: Drop missing values
df.dropna(subset=['Notary System'] + ipri_vars, inplace=True)
# Step 6: Create dummy variables for notary systems
df['Notary_System_Clean'] = df['Notary System'].str.replace(" ", "_")
notary_dummies = pd.get_dummies(df['Notary_System_Clean'], prefix='Notary', drop_first=False)
# Step 7: Standardize IPRI variables for PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[ipri_vars])
# Step 8: Run PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
df[['PC1', 'PC2', 'PC3']] = X_pca
# Step 9: Interpret principal components by top loadings
pca_loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2', 'PC3'], index=ipri_vars)
top_features = {pc: pca_loadings[pc].abs().sort_values(ascending=False).head(3).index.tolist() for pc in ['PC1', 'PC2', 'PC3']}
pc_names = {
'PC1': f"Property Rights and Enforcement ({', '.join(top_features['PC1'])})",
'PC2': f"Political Governance and Rule of Law ({', '.join(top_features['PC2'])})",
'PC3': f"Registration and Financing Access ({', '.join(top_features['PC3'])})"
}
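# Optional check (not one of the original steps): how much variance the three components capture,
# reusing the fitted `pca` object from Step 8.
print("Explained variance ratio:", pca.explained_variance_ratio_.round(3))
print("Cumulative:", pca.explained_variance_ratio_.cumsum().round(3))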
# Step 10: Run logistic regression for each notary system
logit_results = {}
for notary_type in notary_dummies.columns:
    df['target'] = notary_dummies[notary_type].astype(int)
    X = sm.add_constant(df[['PC1', 'PC2', 'PC3']])
    y = df['target']
    model = sm.Logit(y, X).fit(disp=0)
    logit_results[notary_type] = model.summary()
# Step 11: Print interpretation of PCs
print("🔍 Principal Component Names and Top Influences:")
for pc, name in pc_names.items():
    print(f"{pc}: {name}")
# Step 12: Print regression summary for one system
print("
📊 Logistic Regression: Civil_Law_(Latin)")
print(logit_results.get('Notary_Civil_Law_(Latin)', "Notary type not found."))
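As a possible extension (not part of the original steps), the fitted coefficients for the Latin Notariat model can be expressed as odds ratios for easier interpretation; a minimal sketch, assuming the dummy column name used in Step 12:
import numpy as np
# Refit the Civil Law (Latin) model and report exponentiated coefficients (odds ratios)
if 'Notary_Civil_Law_(Latin)' in notary_dummies.columns:
    y_latin = notary_dummies['Notary_Civil_Law_(Latin)'].astype(int)
    latin_model = sm.Logit(y_latin, sm.add_constant(df[['PC1', 'PC2', 'PC3']])).fit(disp=0)
    print("Odds ratios:", np.exp(latin_model.params).round(3))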
Creative Commons Attribution 4.0 International (CC BY 4.0)