Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
The data sets are used in a controlled experiment in which two classifiers are compared. train_a.csv and explain.csv are slices of the original data set. train_b.csv contains the same instances as train_a.csv, but with feature x1 set to 0 to make it unusable to classifier B.
The original data set was created and split using this Python code:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=2, n_redundant=0, n_informative=2,
                           n_clusters_per_class=1, class_sep=0.75, random_state=0)
X *= 100

# Classifier A is trained on the original features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
lm = LogisticRegression()
lm.fit(X_train, y_train)
clf_a = lm

# Classifier B is trained on a copy of the data with feature x1 zeroed out
clf_b = LogisticRegression()
X2 = X.copy()
X2[:, 0] = 0
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.5, random_state=0)
clf_b.fit(X2_train, y2_train)

X_explain = X_test
y_explain = y_test
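The code above does not show how the CSV files themselves were written. The following is a minimal sketch of how the three slices described above could have been saved; the column names x1, x2 and y are assumptions:

import pandas as pd

# train_a.csv: the training slice with both features intact
pd.DataFrame({"x1": X_train[:, 0], "x2": X_train[:, 1], "y": y_train}).to_csv("train_a.csv", index=False)

# train_b.csv: the same instances, but with feature x1 zeroed out
pd.DataFrame({"x1": X2_train[:, 0], "x2": X2_train[:, 1], "y": y2_train}).to_csv("train_b.csv", index=False)

# explain.csv: the held-out slice used for explanations
pd.DataFrame({"x1": X_explain[:, 0], "x2": X_explain[:, 1], "y": y_explain}).to_csv("explain.csv", index=False)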
import pandas as pd
import numpy as np
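The notebook never shows the loading step for the DataFrame named data used below. A minimal sketch, assuming a local CSV of the diabetes health indicators data (the file name is a hypothetical placeholder):

# Load the diabetes data; the file name is a hypothetical placeholder
data = pd.read_csv('diabetes_binary_health_indicators.csv')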
PERFORMING EDA
# Quick look at the data
data.head()
data.info()

# Drop the target column (first column) and keep only the predictors
attributes_data = data.iloc[:, 1:]
attributes_data

attributes_data.describe()
attributes_data.corr()
import seaborn as sns
import matplotlib.pyplot as plt

correlation_matrix = attributes_data.corr()
plt.figure(figsize=(18, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
CHECKING IF DATASET IS LINEAR OR NON-LINEAR
correlations = data.corr()['Diabetes_binary'].drop('Diabetes_binary')
plt.figure(figsize=(10, 6))
correlations.plot(kind='bar')
plt.xlabel('Predictor Columns')
plt.ylabel('Correlation values')
plt.title('Correlation between Diabetes_binary and Predictors')
plt.show()
CHECKING FOR NULL AND MISSING VALUES, CLEANING THEM
print(data.isnull().sum())
print(data.isna().sum())
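If any nulls or missing values are reported, a simple way to clean them is to drop the affected rows; this step is not shown in the original notebook, and imputation would be an alternative. A minimal sketch:

# Drop rows that contain missing values
data = data.dropna().reset_index(drop=True)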
LASSO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
parameters = {"alpha": np.linspace(0.00001, 10, 500)}  # 500 candidate alpha values (np.arange with a step of 500 would yield only a single value)
kfold = KFold(n_splits = 10, shuffle=True, random_state = 42)
lassoReg = Lasso()
lasso_cv = GridSearchCV(lassoReg, param_grid = parameters, cv = kfold)
lasso_cv.fit(X, y)
print("Best Params {}".format(lasso_cv.best_params_))
column_names = list(data)
column_names = column_names[1:]
column_names
lassoModel = Lasso(alpha=0.00001)
lassoModel.fit(X_train, y_train)
lasso_coeff = np.abs(lassoModel.coef_)  # take absolute values of the coefficients
plt.bar(column_names, lasso_coeff, color='orange')
plt.xticks(rotation=90)
plt.grid()
plt.title("Feature Selection Based on Lasso")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.ylim(0, 0.16)
plt.show()
RFE

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
rfecv = RFECV(estimator=model, step=1, cv=20, scoring="accuracy")
rfecv = rfecv.fit(X_train, y_train)
# Cross-validated accuracy for each candidate number of features (rfecv.cv_results_ requires scikit-learn >= 1.0)
cv_scores = rfecv.cv_results_['mean_test_score']
num_features_selected = len(cv_scores)
plt.figure(figsize=(10, 6))
plt.xlabel("Number of features selected")
plt.ylabel("Score (accuracy)")
plt.plot(range(1, num_features_selected + 1), cv_scores, marker='o', color='r')
plt.xticks(range(1, num_features_selected + 1))  # Set x-ticks to integers
plt.grid()
plt.title("RFECV: Number of Features vs. Score (accuracy)")
plt.show()
print("The optimal number of features:", rfecv.n_features_) print("Best features:", X_train.columns[rfecv.support_])
PCA

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
X = data.drop(["Diabetes_binary"], axis=1) y = data["Diabetes_binary"]
df1=pd.DataFrame(data = data,columns=data.columns) print(df1)
scaling=StandardScaler() scaling.fit(df1) Scaled_data=scaling.transform(df1) principal=PCA(n_components=3) principal.fit(Scaled_data) x=principal.transform(Scaled_data) print(x.shape)
principal.components_
plt.scatter(x[:, 0], x[:, 1], c=data['Diabetes_binary'], cmap='plasma')
plt.xlabel('pc1')
plt.ylabel('pc2')
print(principal.explained_variance_ratio_)
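To judge how much variance the three components retain together, the cumulative explained variance can be plotted as well; a small sketch added here for illustration:

plt.plot(np.cumsum(principal.explained_variance_ratio_), marker='o')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
plt.show()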
T-SNE

from sklearn.manifold import TSNE
from numpy import reshape
import seaborn as sns
tsne = TSNE(n_components=3, verbose=1, random_state=42)
z = tsne.fit_transform(X)

df = pd.DataFrame()
df["y"] = y
df["comp-1"] = z[:, 0]
df["comp-2"] = z[:, 1]
df["comp-3"] = z[:, 2]

sns.scatterplot(x="comp-1", y="comp-2", hue=df.y.tolist(),
                palette=sns.color_palette("husl", 2),
                data=df).set(title="Diabetes data T-SNE projection")
Prediction of Phakic Intraocular Lens Vault Using Machine Learning of Anterior Segment Optical Coherence Tomography Metrics. Authors: Kazutaka Kamiya, MD, PhD; Ik Hee Ryu, MD, MS; Tae Keun Yoo, MD; Jung Sub Kim, MD; In Sik Lee, MD, PhD; Jin Kook Kim, MD; Wakako Ando, CO; Nobuyuki Shoji, MD, PhD; Tomofusa Yamauchi, MD, PhD; Hitoshi Tabuchi, MD, PhD.
We hypothesize that machine learning of preoperative biometric data obtained by anterior segment OCT (AS-OCT) may be clinically useful for predicting the actual ICL vault. We therefore built a machine learning model using a random forest to predict the ICL vault after surgery.
This multicenter study comprised one thousand seven hundred forty-five eyes of 1745 consecutive patients (656 men and 1089 women), who underwent EVO ICL implantation (V4c and V5 Visian ICL with KS-AquaPORT) for the correction of moderate to high myopia and myopic astigmatism, and who completed at least a 1-month follow-up, at Kitasato University Hospital (Kanagawa, Japan), or at B&VIIT Eye Center (Seoul, Korea).
This data file (RFR_model(feature=12).mat) is the final trained random forest model for MATLAB 2020a.
Python version:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from google.colab import auth
auth.authenticate_user()
from google.colab import drive
drive.mount('/content/gdrive')

dataset = pd.read_csv('gdrive/My Drive/ICL/data_icl.csv')
dataset.head()

y = dataset['Vault_1M']
X = dataset.drop(['Vault_1M'], axis=1)

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)
parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 500,
              'criterion': 'mae',  # renamed to 'absolute_error' in newer scikit-learn versions
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6,
              'max_leaf_nodes': None}
RF_model = RandomForestRegressor(**parameters)
RF_model.fit(train_X, train_y)
RF_predictions = RF_model.predict(test_X)
importance = RF_model.feature_importances_
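A minimal sketch for evaluating the held-out predictions with standard regression metrics; this is not part of the original listing:

from sklearn.metrics import mean_absolute_error, r2_score

print("MAE:", mean_absolute_error(test_y, RF_predictions))
print("R^2:", r2_score(test_y, RF_predictions))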
Training Code
```Python
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
TEMP_DIR = "tmp"
os.makedirs(TEMP_DIR, exist_ok=True)

train = pd.read_csv('input/map-charting-student-math-misunderstandings/train.csv')
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ":" + train.Misconception
le = LabelEncoder()
train['label'] = le.fit_transform(train['target'])
n_classes = len(le.classes_)  # Number of unique target classes
print(f"Train shape: {train.shape} with {n_classes} target classes")
print("Train head:")
train.head()
# Identify the most frequent MC_Answer per question among rows whose Category starts with 'True'
idx = train.apply(lambda row: row.Category.split('_')[0], axis=1) == 'True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1  # Mark these as correct answers
train = train.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
Model_Name = "unsloth/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForSequenceClassification.from_pretrained(Model_Name, num_labels=n_classes, torch_dtype=torch.bfloat16, device_map="balanced", cache_dir=TEMP_DIR)
tokenizer = AutoTokenizer.from_pretrained(Model_Name, cache_dir=TEMP_DIR)
def format_input(row):
    x = "Yes"
    if not row['is_correct']:
        x = "No"
    return (
        f"Question: {row['QuestionText']} "
        f"Answer: {row['MC_Answer']} "
        f"Correct? {x} "
        f"Student Explanation: {row['StudentExplanation']}"
    )
train['text'] = train.apply(format_input, axis=1)
print("Example prompt for our LLM:")
print()
print(train.text.values[0])
from datasets import Dataset
COLS = ['text', 'label']
train_df_clean = train[COLS].copy() # Use 'train' instead of 'train_df'
train_df_clean['label'] = train_df_clean['label'].astype(np.int64)
train_df_clean = train_df_clean.reset_index(drop=True)
train_ds = Dataset.from_pandas(train_df_clean, preserve_index=False)
def tokenize(batch):
    """Tokenizes a batch of text inputs."""
    return tokenizer(batch["text"], truncation=True, max_length=256)
train_ds = train_ds.map(tokenize, batched=True, remove_columns=['text'])
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
import os
from huggingface_hub import scan_cache_dir

# Free disk space by deleting all cached model revisions (delete_revisions expects commit hashes)
cache_info = scan_cache_dir()
revisions = [rev.commit_hash for repo in cache_info.repos for rev in repo.revisions]
cache_info.delete_revisions(*revisions).execute()
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import tempfile
import shutil

os.makedirs(f"{TEMP_DIR}/training_output/", exist_ok=True)
os.makedirs(f"{TEMP_DIR}/logs/", exist_ok=True)
training_args = TrainingArguments(
output_dir=f"{TEMP_DIR}/training_output/",
do_train=True,
do_eval=False,
save_strategy="no",
num_train_epochs=3,
per_device_train_batch_size=16,
learning_rate=5e-5,
logging_dir=f"{TEMP_DIR}/logs/",
logging_steps=500,
bf16=True,
fp16=False,
report_to="none",
warmup_ratio=0.1,
lr_scheduler_type="cosine",
dataloader_pin_memory=False,
gradient_checkpointing=True,
)
def compute_map3(eval_pred):
    """Computes Mean Average Precision at 3 (MAP@3) for evaluation."""
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # Get top-3 predicted class indices
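    # The original listing is truncated at this point. The rest of this function and the
    # Trainer setup below are a sketch (an assumption, not the authors' exact code) of how
    # MAP@3 is typically computed for a single relevant label and how training could be run.
    top3 = np.argsort(-probs, axis=1)[:, :3]
    map3 = 0.0
    for i, label in enumerate(labels):
        for rank, pred in enumerate(top3[i]):
            if pred == label:
                map3 += 1.0 / (rank + 1)
                break
    return {"map@3": map3 / len(labels)}

# Sketch: assemble the Trainer with dynamic padding and start fine-tuning
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=data_collator,
    compute_metrics=compute_map3,  # only used if an evaluation split is provided
)
trainer.train()
```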
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
🔒 Collection of Privacy-Sensitive Conversations between Care Workers and Care Home Residents in a Residential Care Home 🔒
The dataset is useful for training and evaluating models that identify and classify privacy-sensitive parts of conversations in text, especially in the context of AI assistants and LLMs.
The provided data format is .jsonl, the JSON Lines text format, also called newline-delimited JSON. An example entry looks as follows.
{ "text": "CW: Have you ever been to Italy? CR: Oh, yes... many years ago.", "taxonomy": 0, "category": 0, "affected_speaker": 1, "language": "en", "locale": "US", "data_type": 1, "uid": 16, "split": "train" }
The data fields are:
text: a string feature. The abbreviations of the speakers refer to the care worker (CW) and the care recipient (CR).
taxonomy: a classification label, with possible values including informational (0), invasion (1), collection (2), processing (3), dissemination (4), physical (5), personal-space (6), territoriality (7), intrusion (8), obtrusion (9), contamination (10), modesty (11), psychological (12), interrogation (13), psychological-distance (14), social (15), association (16), crowding-isolation (17), public-gaze (18), solitude (19), intimacy (20), anonymity (21), reserve (22). The taxonomy is derived from Rueben et al. (2017). The classifications were manually labeled by an expert.
category: a classification label, with possible values including personal-information (0), family (1), health (2), thoughts (3), values (4), acquaintance (5), appointment (6). The privacy category affected in the conversation. The classifications were manually labeled by an expert.
affected_speaker: a classification label, with possible values including care-worker (0), care-recipient (1), other (2), both (3). The speaker whose privacy is impacted during the conversation. The classifications were manually labeled by an expert.
language: a string feature. Language code as defined by ISO 639.
locale: a string feature. Regional code as defined by ISO 3166-1 alpha-2.
data_type: a classification label, with possible values including real (0), synthetic (1).
uid: an int64 feature. A unique identifier within the dataset.
split: a string feature. Either train, validation or test.

The dataset has 2 subsets:
split: with a total of 95 examples split into train, validation and test (70%-15%-15%)
unsplit: with a total of 95 examples in a single train split

| name | train | validation | test |
|---|---|---|---|
| split | 66 | 14 | 15 |
| unsplit | 95 | n/a | n/a |
The files follow the naming convention subset-split-language.jsonl. The following files are contained in the dataset:
split-train-en.jsonl
split-validation-en.jsonl
split-test-en.jsonl
unsplit-train-en.jsonl

Recording audio of care workers and residents during care interactions, which include partial and full body washing, giving of medication, as well as wound care, is a highly privacy-sensitive use case. Therefore, a dataset was created that includes privacy-sensitive parts of conversations synthesized from real-world data. This dataset serves as a basis for fine-tuning a local LLM to highlight and classify privacy-sensitive sections of transcripts created in care interactions, so that they can be masked to protect privacy.
The initial data was collected in the project Caring Robots of TU Wien in cooperation with Caritas Wien. One project track aims to use Large Language Models (LLMs) to support the documentation work of care workers, with LLM-generated summaries of audio recordings of interactions between care workers and care home residents. The initial data are the transcriptions of those care interactions.
The transcriptions were thoroughly reviewed, and sections containing privacy-sensitive information were identified and marked by two experts using qualitative data analysis software. Subsequently, the accessible portions of the interviews were translated from German to US English using the locally executed LLM icky/translate. In the next step, a llama3.1:70b model was used locally to synthesize the conversation segments. This process involved generating similar, yet distinct and new, conversations that are not linked to the original data. The dataset was split using the train_test_split function from scikit-learn (https://scikit-learn.org/1.5/modules/generated/sklearn.model_selection.train_test_split.html).
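A minimal sketch for loading one of the JSONL files listed above with pandas; this is an illustration, not part of the original dataset description:

import pandas as pd

# lines=True reads newline-delimited JSON (one record per line)
train_df = pd.read_json("split-train-en.jsonl", lines=True)
print(train_df[["text", "taxonomy", "category", "affected_speaker"]].head())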
CC0 1.0 Universal (Public Domain Dedication): https://creativecommons.org/publicdomain/zero/1.0/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Salary_dataset.csv')
X = dataset.iloc[:, 1:2].values  # years of experience
y = dataset.iloc[:, -1].values   # salary
dataset.head()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
plt.scatter(X_train, y_train, color="red")
plt.plot(X_train, regressor.predict(X_train), color="blue")
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

plt.scatter(X_test, y_test, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
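As a sketch not present in the original notebook, the fit can also be summarized with standard regression metrics on the test set:

from sklearn.metrics import mean_absolute_error, r2_score

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))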
1. Data Source: from https://www.luge.ai/#/luge/dataDetail?id=71
2. Description: (1) The data is related to government affairs. The dataset is suited to a food-safety topic classification system that classifies information and, through model building and semantic analysis, filters out food-safety-related items to help the relevant authorities supervise efficiently and precisely. (2) The data has been de-identified; all key information involving locations, names, and organizations has been replaced with "*". (3) In this dataset, 1 = related to food safety, 0 = not related to food safety.
3. Release Notes. v2-split: (1) the text fields were merged using the template f"主题:{event_name};详细描述:{content}"; (2) the original dataset was split into train, dev, and test using sklearn.model_selection.train_test_split (see the sketch after these notes).
Initial Release: the original, unmodified dataset.
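The following is a minimal sketch of the v2-split preprocessing described above. The file name, the label column name, and the split ratios are assumptions; the text template follows the release note:

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("food_safety.csv")  # hypothetical file name

# Merge the text fields using the template from the release note
df["text"] = df.apply(lambda r: f"主题:{r['event_name']};详细描述:{r['content']}", axis=1)

# Split into train/dev/test (ratios are an assumption; the original note does not state them)
train_df, tmp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
dev_df, test_df = train_test_split(tmp_df, test_size=0.5, random_state=42, stratify=tmp_df["label"])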