This dataset is just a split of the original akemiH/NoteChat.
70% for train, 15% for validation, 15% for test.
Below is the code snippet used to split the dataset.
from datasets import DatasetDict
from datasets import load_dataset

DATASET_SRC_NAME = "akemiH/NoteChat"
DATASET_DST_NAME = "DanielMontecino/NoteChat"
dataset = load_dataset(DATASET_SRC_NAME, split="train")
train_testvalid = dataset.train_test_split(test_size=0.3, seed=2024)
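The snippet stops after the first 70/30 split; to reach 70/15/15, the remaining 30% would then be halved, presumably along these lines (the second split and the DatasetDict assembly are assumptions based on the stated ratios, not the author's published code):

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=2024)
dataset_split = DatasetDict({'train': train_testvalid['train'],
                             'validation': test_valid['train'],
                             'test': test_valid['test']})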
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
The data sets are used in a controlled experiment in which two classifiers are compared. train_a.csv and explain.csv are slices of the original data set. train_b.csv contains the same instances as train_a.csv, but with feature x1 set to 0 so that it is unusable to classifier B.
The original data set was created and split using this Python code:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=1,
                           class_sep=0.75, random_state=0)
X *= 100

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
lm = LogisticRegression()
lm.fit(X_train, y_train)
clf_a = lm

clf_b = LogisticRegression()
X2 = X.copy()
X2[:, 0] = 0
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.5, random_state=0)
clf_b.fit(X2_train, y2_train)

X_explain = X_test
y_explain = y_test
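Since the experiment compares the two classifiers, a minimal head-to-head on the explanation slice might look like this (not part of the original snippet; it assumes the names defined above):

print("clf_a accuracy:", clf_a.score(X_explain, y_explain))
X2_explain = X_explain.copy()
X2_explain[:, 0] = 0  # zero out feature x1, matching what classifier B saw in training
print("clf_b accuracy:", clf_b.score(X2_explain, y_explain))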
A train-test split of the sentences_50agree configuration of https://huggingface.co/datasets/takala/financial_phrasebank. The data was created using the following script:

import datasets

ds = datasets.load_dataset("takala/financial_phrasebank", "sentences_50agree", trust_remote_code=True)['train']
ds = ds.train_test_split(test_size=0.2, seed=42)
Apache License, v2.0: https://www.apache.org/licenses/LICENSE-2.0
License information was derived automatically
How to use:
pip install datasets
dataset = load_dataset("mabughali/miia-pothole-train", split="train") splits = dataset.train_test_split(test_size=0.2) train_ds = splits['train'] val_ds = splits['test']
Training code:

```python
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
TEMP_DIR = "tmp"
os.makedirs(TEMP_DIR, exist_ok=True)

train = pd.read_csv('input/map-charting-student-math-misunderstandings/train.csv')
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ":" + train.Misconception
le = LabelEncoder()
train['label'] = le.fit_transform(train['target'])
n_classes = len(le.classes_)  # Number of unique target classes
print(f"Train shape: {train.shape} with {n_classes} target classes")
print("Train head:")
train.head()
idx = train.apply(lambda row: row.Category.split('_')[0], axis=1) == 'True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1  # Mark these as correct answers
train = train.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
Model_Name = "unsloth/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForSequenceClassification.from_pretrained(Model_Name, num_labels=n_classes, torch_dtype=torch.bfloat16, device_map="balanced", cache_dir=TEMP_DIR)
tokenizer = AutoTokenizer.from_pretrained(Model_Name, cache_dir=TEMP_DIR)
def format_input(row):
    x = "Yes"
    if not row['is_correct']:
        x = "No"
    return (
        f"Question: {row['QuestionText']} "
        f"Answer: {row['MC_Answer']} "
        f"Correct? {x} "
        f"Student Explanation: {row['StudentExplanation']}"
    )
train['text'] = train.apply(format_input, axis=1)
print("Example prompt for our LLM:")
print()
print(train.text.values[0])
from datasets import Dataset
COLS = ['text', 'label']
train_df_clean = train[COLS].copy() # Use 'train' instead of 'train_df'
train_df_clean['label'] = train_df_clean['label'].astype(np.int64)
train_df_clean = train_df_clean.reset_index(drop=True)
train_ds = Dataset.from_pandas(train_df_clean, preserve_index=False)
def tokenize(batch):
    """Tokenizes a batch of text inputs."""
    return tokenizer(batch["text"], truncation=True, max_length=256)
train_ds = train_ds.map(tokenize, batched=True, remove_columns=['text'])
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
import os
from huggingface_hub import scan_cache_dir

# Free disk space by deleting every cached revision
cache_info = scan_cache_dir()
cache_info.delete_revisions(
    *[rev.commit_hash for repo in cache_info.repos for rev in repo.revisions]
).execute()
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import tempfile
import shutil

os.makedirs(f"{TEMP_DIR}/training_output/", exist_ok=True)
os.makedirs(f"{TEMP_DIR}/logs/", exist_ok=True)
training_args = TrainingArguments(
output_dir=f"{TEMP_DIR}/training_output/",
do_train=True,
do_eval=False,
save_strategy="no",
num_train_epochs=3,
per_device_train_batch_size=16,
learning_rate=5e-5,
logging_dir=f"{TEMP_DIR}/logs/",
logging_steps=500,
bf16=True,
fp16=False,
report_to="none",
warmup_ratio=0.1,
lr_scheduler_type="cosine",
dataloader_pin_memory=False,
gradient_checkpointing=True,
)
def compute_map3(eval_pred):
    """ Computes Mean Average Precision at 3 (MAP@3) for evaluation. """
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # Get top 3 predicted class indi...
```
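The snippet is truncated at this point. For completeness, here is a minimal sketch of how MAP@3 is typically computed from those top-3 indices and how the Trainer might be wired up; the names (train_ds, model, tokenizer, training_args) come from the code above, but the body of compute_map3 and the Trainer call are reconstructions, not the author's original code.

```python
import numpy as np
import torch
from transformers import Trainer, DataCollatorWithPadding

def compute_map3(eval_pred):
    """Computes Mean Average Precision at 3 (MAP@3) -- reconstructed sketch."""
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # Top 3 predicted class indices per sample, highest probability first
    top3 = np.argsort(-probs, axis=1)[:, :3]
    # AP@3 = 1 / rank of the true label if it appears in the top 3, else 0
    scores = [
        1.0 / (list(row).index(label) + 1) if label in row else 0.0
        for row, label in zip(top3, labels)
    ]
    return {"map@3": float(np.mean(scores))}

# Hypothetical Trainer wiring -- the original snippet does not show this part
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_map3,
)
trainer.train()
```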
Subsampling of the dataset Amazon_employee_access (4135) with:
seed=4, args.nrows=2000, args.ncols=100, args.nclasses=10, args.no_stratify=True

Generated with the following source code:
def subsample(
self,
seed: int,
nrows_max: int = 2_000,
ncols_max: int = 100,
nclasses_max: int = 10,
stratified: bool = True,
) -> Dataset:
rng = np.random.default_rng(seed)
x = self.x
y = self.y
# Uniformly sample
classes = y.unique()
if len(classes) > nclasses_max:
vcs = y.value_counts()
selected_classes = rng.choice(
classes,
size=nclasses_max,
replace=False,
p=vcs / sum(vcs),
)
        # Select the indices where one of the sampled classes is present
        idxs = y.index[y.isin(selected_classes)]
        x = x.loc[idxs]
        y = y.loc[idxs]
# Uniformly sample columns if required
if len(x.columns) > ncols_max:
columns_idxs = rng.choice(
list(range(len(x.columns))), size=ncols_max, replace=False
)
sorted_column_idxs = sorted(columns_idxs)
selected_columns = list(x.columns[sorted_column_idxs])
x = x[selected_columns]
else:
sorted_column_idxs = list(range(len(x.columns)))
if len(x) > nrows_max:
# Stratify accordingly
target_name = y.name
data = pd.concat((x, y), axis="columns")
_, subset = train_test_split(
data,
test_size=nrows_max,
stratify=data[target_name],
shuffle=True,
random_state=seed,
)
x = subset.drop(target_name, axis="columns")
y = subset[target_name]
# We need to convert categorical columns to string for openml
categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]
columns = list(x.columns)
return Dataset(
# Technically this is not the same but it's where it was derived from
dataset=self.dataset,
x=x,
y=y,
categorical_mask=categorical_mask,
columns=columns,
)
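For reference, a hypothetical invocation matching the arguments listed above (the receiving Dataset object, here called dataset, is assumed from context; note that the row subsampling in the body always stratifies, regardless of the stratified flag):

subsampled = dataset.subsample(
    seed=4,
    nrows_max=2_000,
    ncols_max=100,
    nclasses_max=10,
    stratified=False,  # args.no_stratify=True; this flag is not used by the body above
)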
from datasets import load_dataset, DatasetDict

ds = load_dataset("anton-l/earnings22_robust", split="test")
print(ds)
print(" ", "Split to ==>", " ")
train_devtest = ds.train_test_split(shuffle=True, seed=1, test_size=0.1)
dev_test = train_devtest['test'].train_test_split(shuffle=True, seed=1, test_size=0.5)
ds_train_dev_test = DatasetDict({'train': train_devtest['train'], 'validation': dev_test['train'], 'test':…

See the full description on the dataset page: https://huggingface.co/datasets/sanchit-gandhi/earnings22_robust_split.
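The preview cuts the last line off; given the 90/5/5 split pattern above, it presumably completes as:

ds_train_dev_test = DatasetDict({'train': train_devtest['train'],
                                 'validation': dev_test['train'],
                                 'test': dev_test['test']})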
MIT License: https://opensource.org/licenses/MIT
License information was derived automatically
Example of usage:

from datasets import load_dataset

dataset = load_dataset("Andron00e/CIFAR100-custom")
splitted_dataset = dataset["train"].train_test_split(test_size=0.2)
CC0 1.0: https://creativecommons.org/publicdomain/zero/1.0/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Salary_dataset.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, -1].values
dataset.head()
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
plt.scatter(X_train, y_train, color="red")
plt.plot(X_train, regressor.predict(X_train), color="blue")
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

plt.scatter(X_test, y_test, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
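To quantify the fit rather than only plotting it, one could additionally report R² and the fitted coefficients on the held-out split (not part of the original snippet):

from sklearn.metrics import r2_score

print("R^2 on test set:", r2_score(y_test, y_pred))
print("Slope:", regressor.coef_[0], "Intercept:", regressor.intercept_)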
Subsampling of the dataset connect-4 (40668) with:
seed=2, args.nrows=2000, args.ncols=100, args.nclasses=10, args.no_stratify=True

Generated with the following source code:
def subsample(
self,
seed: int,
nrows_max: int = 2_000,
ncols_max: int = 100,
nclasses_max: int = 10,
stratified: bool = True,
) -> Dataset:
rng = np.random.default_rng(seed)
x = self.x
y = self.y
# Uniformly sample
classes = y.unique()
if len(classes) > nclasses_max:
vcs = y.value_counts()
selected_classes = rng.choice(
classes,
size=nclasses_max,
replace=False,
p=vcs / sum(vcs),
)
        # Select the indices where one of the sampled classes is present
        idxs = y.index[y.isin(selected_classes)]
        x = x.loc[idxs]
        y = y.loc[idxs]
# Uniformly sample columns if required
if len(x.columns) > ncols_max:
columns_idxs = rng.choice(
list(range(len(x.columns))), size=ncols_max, replace=False
)
sorted_column_idxs = sorted(columns_idxs)
selected_columns = list(x.columns[sorted_column_idxs])
x = x[selected_columns]
else:
sorted_column_idxs = list(range(len(x.columns)))
if len(x) > nrows_max:
# Stratify accordingly
target_name = y.name
data = pd.concat((x, y), axis="columns")
_, subset = train_test_split(
data,
test_size=nrows_max,
stratify=data[target_name],
shuffle=True,
random_state=seed,
)
x = subset.drop(target_name, axis="columns")
y = subset[target_name]
# We need to convert categorical columns to string for openml
categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]
columns = list(x.columns)
return Dataset(
# Technically this is not the same but it's where it was derived from
dataset=self.dataset,
x=x,
y=y,
categorical_mask=categorical_mask,
columns=columns,
)
Subsampling of the dataset car (40975) with:
seed=2, args.nrows=2000, args.ncols=100, args.nclasses=10, args.no_stratify=True

Generated with the following source code:
def subsample(
self,
seed: int,
nrows_max: int = 2_000,
ncols_max: int = 100,
nclasses_max: int = 10,
stratified: bool = True,
) -> Dataset:
rng = np.random.default_rng(seed)
x = self.x
y = self.y
# Uniformly sample
classes = y.unique()
if len(classes) > nclasses_max:
vcs = y.value_counts()
selected_classes = rng.choice(
classes,
size=nclasses_max,
replace=False,
p=vcs / sum(vcs),
)
        # Select the indices where one of the sampled classes is present
        idxs = y.index[y.isin(selected_classes)]
        x = x.loc[idxs]
        y = y.loc[idxs]
# Uniformly sample columns if required
if len(x.columns) > ncols_max:
columns_idxs = rng.choice(
list(range(len(x.columns))), size=ncols_max, replace=False
)
sorted_column_idxs = sorted(columns_idxs)
selected_columns = list(x.columns[sorted_column_idxs])
x = x[selected_columns]
else:
sorted_column_idxs = list(range(len(x.columns)))
if len(x) > nrows_max:
# Stratify accordingly
target_name = y.name
data = pd.concat((x, y), axis="columns")
_, subset = train_test_split(
data,
test_size=nrows_max,
stratify=data[target_name],
shuffle=True,
random_state=seed,
)
x = subset.drop(target_name, axis="columns")
y = subset[target_name]
# We need to convert categorical columns to string for openml
categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]
columns = list(x.columns)
return Dataset(
# Technically this is not the same but it's where it was derived from
dataset=self.dataset,
x=x,
y=y,
categorical_mask=categorical_mask,
columns=columns,
)
Subsampling of the dataset Internet-Advertisements (40978) with:
seed=2, args.nrows=2000, args.ncols=100, args.nclasses=10, args.no_stratify=True

Generated with the following source code:
def subsample(
self,
seed: int,
nrows_max: int = 2_000,
ncols_max: int = 100,
nclasses_max: int = 10,
stratified: bool = True,
) -> Dataset:
rng = np.random.default_rng(seed)
x = self.x
y = self.y
# Uniformly sample
classes = y.unique()
if len(classes) > nclasses_max:
vcs = y.value_counts()
selected_classes = rng.choice(
classes,
size=nclasses_max,
replace=False,
p=vcs / sum(vcs),
)
        # Select the indices where one of the sampled classes is present
        idxs = y.index[y.isin(selected_classes)]
        x = x.loc[idxs]
        y = y.loc[idxs]
# Uniformly sample columns if required
if len(x.columns) > ncols_max:
columns_idxs = rng.choice(
list(range(len(x.columns))), size=ncols_max, replace=False
)
sorted_column_idxs = sorted(columns_idxs)
selected_columns = list(x.columns[sorted_column_idxs])
x = x[selected_columns]
else:
sorted_column_idxs = list(range(len(x.columns)))
if len(x) > nrows_max:
# Stratify accordingly
target_name = y.name
data = pd.concat((x, y), axis="columns")
_, subset = train_test_split(
data,
test_size=nrows_max,
stratify=data[target_name],
shuffle=True,
random_state=seed,
)
x = subset.drop(target_name, axis="columns")
y = subset[target_name]
# We need to convert categorical columns to string for openml
categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]
columns = list(x.columns)
return Dataset(
# Technically this is not the same but it's where it was derived from
dataset=self.dataset,
x=x,
y=y,
categorical_mask=categorical_mask,
columns=columns,
)
CC0 1.0: https://creativecommons.org/publicdomain/zero/1.0/
Various methods were used to solve TPS Oct 2021, including LightGBM, CatBoost, and XGBoost, in which hyperparameters play an important role.
One option is to use a hyperparameter optimization framework such as Optuna to search for hyperparameters. Another is to reuse parameters that have already been published and produced good results.
In this dataset, I collected all the parameters of LightGBM, CatBoost, and XGBoost introduced in TPS Oct 2021.
All parameters were checked under the same conditions. I used the following setup to measure the accuracy of each parameter set. This is not the final accuracy, because it is measured on only 20% of the data, but it is a consistent criterion for comparing parameter sets.
import datatable as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load the 20% sample, dropping the 'id' column on read
train_20 = dt.fread('sample_train_20.csv', columns=lambda cols: [col.name not in ('id',) for col in cols]).to_pandas()
y = train_20['target']
X = train_20.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 / 5), random_state=59)
model = model_from_csv(**params_from_csv)
model.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=["auc"],
verbose=False,
early_stopping_rounds=600
)
y_predicted = model.predict_proba(X_test)
accuracy = roc_auc_score(y_test, y_predicted[:, 1])
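Here model_from_csv and params_from_csv are placeholders for whichever model family and parameter row is being checked. A hypothetical instantiation for one XGBoost row might look like this (the CSV file name and its layout are assumptions):

import pandas as pd
from xgboost import XGBClassifier

# Hypothetical: take one collected parameter set from the CSV
params_from_csv = pd.read_csv('xgboost_parameters.csv').iloc[0].to_dict()
model = XGBClassifier(**params_from_csv)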
I will try to create similar datasets for future competitions as well. If you find this dataset helpful, please don't forget to upvote it. Thank you in advance.
Subsampling of the dataset kr-vs-kp (3) with:
seed=1, args.nrows=2000, args.ncols=100, args.nclasses=10, args.no_stratify=True

Generated with the following source code:
def subsample(
self,
seed: int,
nrows_max: int = 2_000,
ncols_max: int = 100,
nclasses_max: int = 10,
stratified: bool = True,
) -> Dataset:
rng = np.random.default_rng(seed)
x = self.x
y = self.y
# Uniformly sample
classes = y.unique()
if len(classes) > nclasses_max:
vcs = y.value_counts()
selected_classes = rng.choice(
classes,
size=nclasses_max,
replace=False,
p=vcs / sum(vcs),
)
        # Select the indices where one of the sampled classes is present
        idxs = y.index[y.isin(selected_classes)]
        x = x.loc[idxs]
        y = y.loc[idxs]
# Uniformly sample columns if required
if len(x.columns) > ncols_max:
columns_idxs = rng.choice(
list(range(len(x.columns))), size=ncols_max, replace=False
)
sorted_column_idxs = sorted(columns_idxs)
selected_columns = list(x.columns[sorted_column_idxs])
x = x[selected_columns]
else:
sorted_column_idxs = list(range(len(x.columns)))
if len(x) > nrows_max:
# Stratify accordingly
target_name = y.name
data = pd.concat((x, y), axis="columns")
_, subset = train_test_split(
data,
test_size=nrows_max,
stratify=data[target_name],
shuffle=True,
random_state=seed,
)
x = subset.drop(target_name, axis="columns")
y = subset[target_name]
# We need to convert categorical columns to string for openml
categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]
columns = list(x.columns)
return Dataset(
# Technically this is not the same but it's where it was derived from
dataset=self.dataset,
x=x,
y=y,
categorical_mask=categorical_mask,
columns=columns,
)