100+ datasets found
  1. Images used for training, validation, and testing.
    Cheetah and Jaguar images.

    • kaggle.com
    Updated Mar 15, 2024
    Cite
    Chrysthian Chrisley (2024). Images used for training, validation, and testing. [Dataset]. https://www.kaggle.com/datasets/chrysthian/images-used-for-training-validation-and-testing
    Explore at:
    224 scholarly articles cite this dataset (View in Google Scholar)
    Croissant is a format for machine-learning datasets. Learn more about this at mlcommons.org/croissant.
    Dataset updated
    Mar 15, 2024
    Dataset provided by
    Kaggle
    Authors
    Chrysthian Chrisley
    License

    Attribution-ShareAlike 3.0 (CC BY-SA 3.0): https://creativecommons.org/licenses/by-sa/3.0/
    License information was derived automatically

    Description

    Imports:

    # All imports
    import os
    from matplotlib import pyplot as plt
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    import seaborn as sns
    import matplotlib.image as mpimg
    import cv2
    import numpy as np
    import pickle
    
    # TensorFlow and Keras: model, layers, optimizer, and loss
    import tensorflow as tf
    from tensorflow import keras
    from keras import Sequential
    from keras.layers import *
    
    # Optimizer
    from keras.optimizers import Adamax
    
    # Pre-trained models
    from keras.applications import *
    
    # Early stopping
    from keras.callbacks import EarlyStopping
    import warnings
    

    Warnings Suppression | Configuration

    # Suppress warnings
    warnings.filterwarnings("ignore")
    
    # Define the base path for the training folder
    base_path = 'jaguar_cheetah/train'
    
    # Weights file
    weights_file = 'Model_train_weights.weights.h5'
    
    # Path to the saved model (or where to save it):
    model_file = 'Model-cheetah_jaguar_Treined.keras'
    
    # Model history
    history_path = 'training_history_cheetah_jaguar.pkl'
    
    # Initialize lists to store file paths and labels
    filepaths = []
    labels = []
    
    # Iterate over folders and files within the training directory
    for folder in ['Cheetah', 'Jaguar']:
      folder_path = os.path.join(base_path, folder)
      for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        filepaths.append(file_path)
        labels.append(folder)
    
    # Create the TRAINING dataframe
    file_path_series = pd.Series(filepaths, name='filepath')
    label_series = pd.Series(labels, name='label')
    df_train = pd.concat([file_path_series, label_series], axis=1)
    
    
    # Define the base path for the test folder
    directory = "jaguar_cheetah/test"
    
    filepath = []
    label = []
    
    folds = os.listdir(directory)
    
    for fold in folds:
      f_path = os.path.join(directory, fold)
      imgs = os.listdir(f_path)
    
      for img in imgs:
        img_path = os.path.join(f_path, img)
        filepath.append(img_path)
        label.append(fold)
    
    # Create the TEST dataframe
    file_path_series = pd.Series(filepath, name='filepath')
    label_series = pd.Series(label, name='label')
    df_test = pd.concat([file_path_series, label_series], axis=1)
    
    # Display the first rows of the dataframe for verification
    #print(df_train)
    
    # Folders with Training and Test files
    data_dir = 'jaguar_cheetah/train'
    test_dir = 'jaguar_cheetah/test'
    
    # Image size 256x256
    IMAGE_SIZE = (256,256) 
    

    Train | Test

    #print('Training Images:')
    
    # Create the TRAINING dataset
    train_ds = tf.keras.utils.image_dataset_from_directory(
      data_dir,
      validation_split=0.1,
      subset='training',
      seed=123,
      image_size=IMAGE_SIZE,
      batch_size=32)
    
    # Validation data
    #print('Validation Images:')
    validation_ds = tf.keras.utils.image_dataset_from_directory(
      data_dir, 
      validation_split=0.1,
      subset='validation',
      seed=123,
      image_size=IMAGE_SIZE,
      batch_size=32)
    
    print('Testing Images:')
    test_ds = tf.keras.utils.image_dataset_from_directory(
      test_dir, 
      seed=123,
      image_size=IMAGE_SIZE,
      batch_size=32)
    
    # Extract labels
    train_labels = train_ds.class_names
    test_labels = test_ds.class_names
    validation_labels = validation_ds.class_names
    
    # Encode labels
    # Define the class labels (these must match the training folder names,
    # or label_encoder.transform() will fail on unseen labels)
    class_labels = ['Cheetah', 'Jaguar']
    
    # Instantiate the LabelEncoder
    label_encoder = LabelEncoder()
    
    # Fit the label encoder on the class labels
    label_encoder.fit(class_labels)
    
    # Transform the labels for the training dataset
    train_labels_encoded = label_encoder.transform(train_labels)
    
    # Transform the labels for the validation dataset
    validation_labels_encoded = label_encoder.transform(validation_labels)
    
    # Transform the labels for the testing dataset
    test_labels_encoded = label_encoder.transform(test_labels)
    
    # Normalize the pixel values
    
    # Train files 
    train_ds = train_ds.map(lambda x, y: (x / 255.0, y))
    # Validate files
    validation_ds = validation_ds.map(lambda x, y: (x / 255.0, y))
    # Test files
    test_ds = test_ds.map(lambda x, y: (x / 255.0, y))
    
    #TRAINING VISUALIZATION
    #Count the occurrences of each category in the column
    count = df_train['label'].value_counts()
    
    # Create a figure with 2 subplots
    fig, axs = plt.subplots(1, 2, figsize=(12, 6), facecolor='white')
    
    # Plot a pie chart on the first subplot
    palette = sns.color_palette("viridis")
    sns.set_palette(palette)
    axs[0].pie(count, labels=count.index, autopct='%1.1f%%', startangle=140)
    axs[0].set_title('Distribution of Training Categories')
    
    # Plot a bar chart on the second subplot
    sns.barplot(x=count.index, y=count.values, ax=axs[1], palette="viridis")
    axs[1].set_title('Count of Training Categories')
    
    # Adjust the layout
    plt.tight_layout()
    
    # Visualize
    plt.show()
    
    # TEST VISUALIZATION
    count = df_test['label'].value_counts()
    
    # Create a figure with 2 subplots
    fig, axs = plt.subplots(1, 2, figsize=(12, 6), facecolor='white')
    
    # Plot a pie chart on the first subplot
    axs[0].pie(count, labels=count.index, autopct='%1.1f%%', startangle=140)
    axs[0].set_title('Distribution of Test Categories')
    
    # Plot a bar chart on the second subplot
    sns.barplot(x=count.index, y=count.values, ax=axs[1], palette="viridis")
    axs[1].set_title('Count of Test Categories')
    
    # Adjust the layout and visualize
    plt.tight_layout()
    plt.show()
    
  2. Machine learning algorithm validation with a limited sample size

    • plos.figshare.com
    text/x-python
    Updated May 30, 2023
    Cite
    Andrius Vabalas; Emma Gowen; Ellen Poliakoff; Alexander J. Casson (2023). Machine learning algorithm validation with a limited sample size [Dataset]. http://doi.org/10.1371/journal.pone.0224365
    Explore at:
    Available download formats: text/x-python
    Dataset updated
    May 30, 2023
    Dataset provided by
    PLOS (http://plos.org/)
    Authors
    Andrius Vabalas; Emma Gowen; Ellen Poliakoff; Alexander J. Casson
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Advances in neuroimaging, genomic, motion tracking, eye-tracking and many other technology-based data collection methods have led to a torrent of high dimensional datasets, which commonly have a small number of samples because of the intrinsic high cost of data collection involving human participants. High dimensional data with a small number of samples is of critical importance for identifying biomarkers and conducting feasibility and pilot work, however it can lead to biased machine learning (ML) performance estimates. Our review of studies which have applied ML to predict autistic from non-autistic individuals showed that small sample size is associated with higher reported classification accuracy. Thus, we have investigated whether this bias could be caused by the use of validation methods which do not sufficiently control overfitting. Our simulations show that K-fold Cross-Validation (CV) produces strongly biased performance estimates with small sample sizes, and the bias is still evident with a sample size of 1000. Nested CV and train/test split approaches produce robust and unbiased performance estimates regardless of sample size. We also show that feature selection, if performed on pooled training and testing data, contributes considerably more to bias than parameter tuning. In addition, the contribution to bias by data dimensionality, hyper-parameter space and number of CV folds was explored, and validation methods were compared with discriminable data. The results suggest how to design robust testing methodologies when working with small datasets and how to interpret the results of other studies based on what validation method was used.
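
    To see the pooled-selection bias the abstract describes, here is a minimal sketch (not the authors' code; the toy data, model, and sizes are assumptions):

    # Sketch: feature selection on pooled data inflates K-fold CV accuracy.
    import numpy as np
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.svm import SVC
    
    rng = np.random.default_rng(0)
    X = rng.normal(size=(40, 1000))  # small sample, high dimensionality
    y = rng.integers(0, 2, size=40)  # pure noise: true accuracy is 0.5
    
    # Biased: select features on ALL data, then cross-validate
    X_sel = SelectKBest(f_classif, k=10).fit_transform(X, y)
    biased = cross_val_score(SVC(), X_sel, y, cv=5).mean()
    
    # Unbiased: selection happens inside each training fold via a pipeline
    pipe = make_pipeline(SelectKBest(f_classif, k=10), SVC())
    unbiased = cross_val_score(pipe, X, y, cv=5).mean()
    
    print(f"pooled selection: {biased:.2f}, in-fold selection: {unbiased:.2f}")
    # On noise data the first estimate lands well above chance; the second stays near 0.5.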

  3. Training and Validation Datasets for Neural Network to Fill in Missing Data in EBSD Maps

    • catalog.data.gov
    • gimi9.com
    Updated Jul 9, 2025
    Cite
    National Institute of Standards and Technology (2025). Training and Validation Datasets for Neural Network to Fill in Missing Data in EBSD Maps [Dataset]. https://catalog.data.gov/dataset/training-and-validation-datasets-for-neural-network-to-fill-in-missing-data-in-ebsd-maps
    Explore at:
    Dataset updated
    Jul 9, 2025
    Dataset provided by
    National Institute of Standards and Technology (http://www.nist.gov/)
    Description

    This dataset consists of the synthetic electron backscatter diffraction (EBSD) maps generated for the paper titled "Hybrid Algorithm for Filling in Missing Data in Electron Backscatter Diffraction Maps" by Emmanuel Atindama, Conor Miller-Lynch, Huston Wilhite, Cody Mattice, Günay Doğan, and Prashant Athavale. The EBSD maps were used to train, test, and validate a neural network algorithm that fills in missing data points in a given EBSD map. The dataset includes 8000 maps for training, 1000 for testing, and 2000 for validation, as well as a noise-added version of each clean map.

  4. Dataset for training

    • ieee-dataport.org
    Updated Sep 8, 2025
    Cite
    Junfeng Zhao (2025). Dataset for training [Dataset]. https://ieee-dataport.org/documents/dataset-training-validation-and-testing-1d-ml-dft
    Explore at:
    Dataset updated
    Sep 8, 2025
    Authors
    Junfeng Zhao
    Description

    The data are organized within the "ions" and "molecules" folders.

  5. File Validation and Training Statistics

    • kaggle.com
    zip
    Updated Dec 1, 2023
    Cite
    The Devastator (2023). File Validation and Training Statistics [Dataset]. https://www.kaggle.com/datasets/thedevastator/file-validation-and-training-statistics
    Explore at:
    Available download formats: zip (16413235 bytes)
    Dataset updated
    Dec 1, 2023
    Authors
    The Devastator
    License

    CC0 1.0 Universal Public Domain Dedication: https://creativecommons.org/publicdomain/zero/1.0/

    Description

    File Validation and Training Statistics

    Validation, Training, and Testing Statistics for tasksource/leandojo Files

    By tasksource (from Hugging Face)

    About this dataset

    The tasksource/leandojo: File Validation, Training, and Testing Statistics dataset is a comprehensive collection of information regarding the validation, training, and testing processes of files in the tasksource/leandojo repository. This dataset is essential for gaining insights into the file management practices within this specific repository.

    The dataset consists of three distinct files: validation.csv, train.csv, and test.csv. Each file serves a unique purpose in providing statistics and information about the different stages involved in managing files within the repository.

    In validation.csv, you will find detailed information about the validation process undergone by each file. This includes data such as file paths within the repository (file_path), full names of each file (full_name), associated commit IDs (commit), traced tactics implemented (traced_tactics), URLs pointing to each file (url), and respective start and end dates for validation.

    train.csv focuses on providing statistics related to the training phase of files. Here, you can access data such as file paths within the repository (file_path), full names of individual files (full_name), associated commit IDs (commit), traced tactics utilized during training (traced_tactics), and URLs linking to each file undergoing training procedures (url).

    Lastly, test.csv contains statistics about testing activities performed on files within the tasksource/leandojo repository: file paths within the repo structure (file_path), full names of each tested file (full_name), associated commit IDs for the tested versions (commit), traced tactics used during testing (traced_tactics), and URLs pointing to the tested files (url).

    By exploring this dataset of three CSV files - validation.csv, train.csv, and test.csv - researchers can gain insights into how validation, training, and testing tasks have been carried out to maintain high-quality standards within the tasksource/leandojo repository.

    How to use the dataset

    • Familiarize Yourself with the Dataset Structure:

      • The dataset consists of three separate files: validation.csv, train.csv, and test.csv.
      • Each file contains multiple columns providing different information about file validation, training, and testing.
    • Explore the Columns:

      • 'file_path': This column represents the path of the file within the repository.
      • 'full_name': This column displays the full name of each file.
      • 'commit': The commit ID associated with each file is provided in this column.
      • 'traced_tactics': The tactics traced in each file are listed in this column.
      • 'url': This column provides the URL of each file.
    • Understand Each File's Purpose:

    Validation.csv - This file contains information related to the validation process of files in the tasksource/leandojo repository.

    Train.csv - Utilize this file if you need statistics and information regarding the training phase of files in tasksource/leandojo repository.

    Test.csv - For insights into statistics and information about testing individual files within tasksource/leandojo repository, refer to this file.

    • Generate Insights & Analyze Data:
    • Once you have a clear understanding of each column's purpose, you can start generating insights from your analysis using various statistical techniques or machine learning algorithms.
    • Explore patterns or trends by examining specific columns such as 'traced_tactics' or analyzing multiple columns together.

    • Combine Multiple Files (if necessary):

      • If required, you can merge/correlate data across the different CSV files on common fields such as 'file_path', 'full_name', or 'commit' (see the sketch after this list).

    • Visualize the Data (Optional):

    • To enhance your analysis, consider creating visualizations such as plots, charts, or graphs. Visualization can offer a clear representation of patterns or relationships within the dataset.

    • Obtain Further Information:

    • If you need additional details about any specific file, make use of the provided 'url' column to access further information.

    Remember that this guide provides a general overview of how to utilize this dataset effectively. Feel ...
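
    As referenced above, a minimal pandas sketch of loading and correlating the three splits (the file names follow the dataset description; everything else is an assumption):

    # Sketch: load the three split files and look for overlap across splits.
    import pandas as pd
    
    val = pd.read_csv("validation.csv")
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    
    # Tag each row with its split, then stack for cross-split analysis
    for df, split in [(val, "validation"), (train, "train"), (test, "test")]:
        df["split"] = split
    all_files = pd.concat([val, train, test], ignore_index=True)
    
    # Example: which files appear in more than one split?
    splits_per_file = all_files.groupby("full_name")["split"].nunique()
    print(splits_per_file[splits_per_file > 1].head())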

  6. MS Training Set, MS Validation Set, and UW Validation/Test Set - Dataset - LDM

    • service.tib.eu
    Updated Dec 17, 2024
    Cite
    (2024). MS Training Set, MS Validation Set, and UW Validation/Test Set - Dataset - LDM [Dataset]. https://service.tib.eu/ldmservice/dataset/ms-training-set--ms-validation-set--and-uw-validation-test-set
    Explore at:
    Dataset updated
    Dec 17, 2024
    Description

    The MS Training Set, MS Validation Set, and UW Validation/Test Set are used for training, validation, and testing the proposed methods.

  7. Data from: Web Data Commons Training and Test Sets for Large-Scale Product Matching

    • linkagelibrary.icpsr.umich.edu
    • da-ra.de
    Updated Nov 26, 2020
    + more versions
    Cite
    Ralph Peeters; Anna Primpeli; Christian Bizer (2020). Web Data Commons Training and Test Sets for Large-Scale Product Matching - Version 2.0 [Dataset]. http://doi.org/10.3886/E127481V1
    Explore at:
    Dataset updated
    Nov 26, 2020
    Dataset provided by
    University of Mannheim (Germany)
    Authors
    Ralph Peeters; Anna Primpeli; Christian Bizer
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Many e-shops have started to mark up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label "match" or "no match") for four product categories: computers, cameras, watches and shoes. In order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test sets. For each product category, we provide training sets in four different sizes (2,000-70,000 pairs). Furthermore, there are sets of ids for each training set for a possible validation split (stratified random draw). The test set for each product category consists of 1,100 product pairs. The labels of the test sets were manually checked, while those of the training sets were derived from shared product identifiers on the Web (weak supervision). The data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0, which consists of 26 million product offers originating from 79 thousand websites. For more information and download links for the corpus itself, please follow the links below.
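
    As an illustration of the stratified validation draw mentioned above (a sketch only; the file name, gzipped JSON-lines format, and "label" column are assumptions about the download):

    # Sketch: carve a stratified validation split out of a training set of
    # labeled product pairs, mirroring the provided id-based splits.
    import pandas as pd
    from sklearn.model_selection import train_test_split
    
    pairs = pd.read_json("computers_train_medium.json.gz", lines=True)  # hypothetical file
    train_pairs, val_pairs = train_test_split(
        pairs, test_size=0.2, stratify=pairs["label"], random_state=42)
    print(len(train_pairs), len(val_pairs))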

  8. Validation Data and Control Software for ATIC: Automated Testbed for...

    • catalog.data.gov
    • nist.gov
    Updated Dec 15, 2023
    Cite
    National Institute of Standards and Technology (2023). Validation Data and Control Software for ATIC: Automated Testbed for Interference Testing in Communication Systems [Dataset]. https://catalog.data.gov/dataset/validation-data-and-control-software-for-atic-automated-testbed-for-interference-testing-i-61d38
    Explore at:
    Dataset updated
    Dec 15, 2023
    Dataset provided by
    National Institute of Standards and Technology (http://www.nist.gov/)
    Description

    Validation data and software for the paper, "ATIC: Automated Testbed for Interference Testing in Communication Systems," to appear in Proceedings of 2023 IEEE Military Communications Conference. See the README file for descriptions of the data files. Software is available at https://github.com/usnistgov/atic.

  9. give us the data validation test set

    • kaggle.com
    zip
    Updated Apr 23, 2021
    Cite
    Anna (2021). give us the data validation test set [Dataset]. https://www.kaggle.com/annatmp/give-us-the-data-validation-test-set
    Explore at:
    Available download formats: zip (439562080 bytes)
    Dataset updated
    Apr 23, 2021
    Authors
    Anna
    Description

    Dataset

    This dataset was created by Anna

    Contents

  10. Data from: A Neural Approach for Text Extraction from Scholarly Figures

    • data.uni-hannover.de
    zip
    Updated Jan 20, 2022
    Cite
    TIB (2022). A Neural Approach for Text Extraction from Scholarly Figures [Dataset]. https://data.uni-hannover.de/dataset/a-neural-approach-for-text-extraction-from-scholarly-figures
    Explore at:
    Available download formats: zip
    Dataset updated
    Jan 20, 2022
    Dataset authored and provided by
    TIB
    License

    Attribution 3.0 (CC BY 3.0): https://creativecommons.org/licenses/by/3.0/
    License information was derived automatically

    Description

    A Neural Approach for Text Extraction from Scholarly Figures

    This is the readme for the supplemental data for our ICDAR 2019 paper.

    You can read our paper via IEEE here: https://ieeexplore.ieee.org/document/8978202

    If you found this dataset useful, please consider citing our paper:

    @inproceedings{DBLP:conf/icdar/MorrisTE19,
     author  = {David Morris and
            Peichen Tang and
            Ralph Ewerth},
     title   = {A Neural Approach for Text Extraction from Scholarly Figures},
     booktitle = {2019 International Conference on Document Analysis and Recognition,
            {ICDAR} 2019, Sydney, Australia, September 20-25, 2019},
     pages   = {1438--1443},
     publisher = {{IEEE}},
     year   = {2019},
     url    = {https://doi.org/10.1109/ICDAR.2019.00231},
     doi    = {10.1109/ICDAR.2019.00231},
     timestamp = {Tue, 04 Feb 2020 13:28:39 +0100},
     biburl  = {https://dblp.org/rec/conf/icdar/MorrisTE19.bib},
     bibsource = {dblp computer science bibliography, https://dblp.org}
    }
    

    This work was financially supported by the German Federal Ministry of Education and Research (BMBF) and European Social Fund (ESF) (InclusiveOCW project, no. 01PE17004).

    Datasets

    We used different sources of data for testing, validation, and training. Our testing set was assembled from the work by Böschen et al. that we cited. We excluded the DeGruyter dataset and used it as our validation dataset.

    Testing

    These datasets contain a readme with license information. Further information about the associated project can be found in the authors' published work we cited: https://doi.org/10.1007/978-3-319-51811-4_2

    Validation

    The DeGruyter dataset does not include the labeled images due to license restrictions. As of writing, the images can still be downloaded from DeGruyter via the links in the readme. Note that depending on what program you use to strip the images out of the PDF they are provided in, you may have to re-number the images.

    Training

    We used label_generator's generated dataset, which the author made available on a requester-pays amazon s3 bucket. We also used the Multi-Type Web Images dataset, which is mirrored here.

    Code

    We have made our code available in code.zip. We will upload code, announce further news, and field questions via the github repo.

    Our text detection network is adapted from Argman's EAST implementation. The EAST/checkpoints/ours subdirectory contains the trained weights we used in the paper.

    We used a Tesseract script to run text extraction from detected text rows. This is included in our code archive as text_recognition_multipro.py.

    We used a Java tool provided by Falk Böschen, adapted to our file structure. We included this as evaluator.jar.

    Parameter sweeps are automated by param_sweep.rb. This file also shows how to invoke all of these components.

  11. Data from: Robust Validation: Confident Predictions Even When Distributions Shift

    • tandf.figshare.com
    bin
    Updated Dec 26, 2023
    Cite
    Maxime Cauchois; Suyash Gupta; Alnur Ali; John C. Duchi (2023). Robust Validation: Confident Predictions Even When Distributions Shift* [Dataset]. http://doi.org/10.6084/m9.figshare.24904721.v1
    Explore at:
    Available download formats: bin
    Dataset updated
    Dec 26, 2023
    Dataset provided by
    Taylor & Francis (https://taylorandfrancis.com/)
    Authors
    Maxime Cauchois; Suyash Gupta; Alnur Ali; John C. Duchi
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    While the traditional viewpoint in machine learning and statistics assumes training and testing samples come from the same population, practice belies this fiction. One strategy—coming from robust statistics and optimization—is thus to build a model robust to distributional perturbations. In this paper, we take a different approach to describe procedures for robust predictive inference, where a model provides uncertainty estimates on its predictions rather than point predictions. We present a method that produces prediction sets (almost exactly) giving the right coverage level for any test distribution in an f-divergence ball around the training population. The method, based on conformal inference, achieves (nearly) valid coverage in finite samples, under only the condition that the training data be exchangeable. An essential component of our methodology is to estimate the amount of expected future data shift and build robustness to it; we develop estimators and prove their consistency for protection and validity of uncertainty estimates under shifts. By experimenting on several large-scale benchmark datasets, including Recht et al.’s CIFAR-v4 and ImageNet-V2 datasets, we provide complementary empirical results that highlight the importance of robust predictive validity.
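
    For orientation, a minimal sketch of the standard split-conformal baseline that this robust method extends (toy data and model are assumptions; the paper's f-divergence correction is not reproduced here):

    # Sketch: split-conformal regression intervals under exchangeability.
    import numpy as np
    from sklearn.linear_model import Ridge
    
    rng = np.random.default_rng(1)
    X = rng.normal(size=(500, 5))
    y = X @ rng.normal(size=5) + rng.normal(size=500)
    X_fit, y_fit = X[:250], y[:250]    # model-fitting half
    X_cal, y_cal = X[250:], y[250:]    # calibration half
    
    model = Ridge().fit(X_fit, y_fit)
    scores = np.abs(y_cal - model.predict(X_cal))  # conformity scores
    alpha = 0.1
    n = len(scores)
    q = np.quantile(scores, np.ceil((n + 1) * (1 - alpha)) / n)
    
    x_new = rng.normal(size=(1, 5))
    pred = model.predict(x_new)[0]
    print(f"90% prediction set: [{pred - q:.2f}, {pred + q:.2f}]")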

  12. Training, validation and test datasets and model files for larger US Health Insurance dataset

    • ufs.figshare.com
    txt
    Updated Dec 12, 2023
    Cite
    Jan Marthinus Blomerus (2023). Training, validation and test datasets and model files for larger US Health Insurance dataset [Dataset]. http://doi.org/10.38140/ufs.24598881.v2
    Explore at:
    Available download formats: txt
    Dataset updated
    Dec 12, 2023
    Dataset provided by
    University of the Free State
    Authors
    Jan Marthinus Blomerus
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Formats1.xlsx contains the descriptions of the columns of the following datasets. The training, validation, and test datasets in combination comprise all the records. sens1.csv and meansdX.csv are required for testing.

  13. 10 Validation Testing Brightness Dataset

    • universe.roboflow.com
    zip
    Updated May 24, 2023
    Cite
    University Science Malaysia (2023). 10 Validation Testing Brightness Dataset [Dataset]. https://universe.roboflow.com/university-science-malaysia/10-validation-testing-brightness
    Explore at:
    Available download formats: zip
    Dataset updated
    May 24, 2023
    Dataset authored and provided by
    University Science Malaysia
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Variables measured
    10 Validation Testing Brightness Bounding Boxes
    Description

    10 Validation Testing Brightness

    ## Overview
    
    10 Validation Testing Brightness is a dataset for object detection tasks - it contains 10 Validation Testing Brightness annotations for 464 images.
    
    ## Getting Started
    
    You can download this dataset for use within your own projects, or fork it into a workspace on Roboflow to create your own model.
    
      ## License
    
      This dataset is available under the [CC BY 4.0 license](https://creativecommons.org/licenses/by/4.0/).
    
  14. Data pipeline Validation And Load Testing using Multiple CSV Files

    • data.niaid.nih.gov
    • data.europa.eu
    Updated Mar 26, 2021
    Cite
    Mainak Adhikari; Afsana Khan; Pelle Jakovits (2021). Data pipeline Validation And Load Testing using Multiple CSV Files [Dataset]. https://data.niaid.nih.gov/resources?id=zenodo_4636797
    Explore at:
    Dataset updated
    Mar 26, 2021
    Dataset provided by
    Lecturer, University of Tartu
    Research Fellow, University of Tartu
    Masters Student, University of Tartu
    Authors
    Mainak Adhikari; Afsana Khan; Pelle Jakovits
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    The datasets were used to validate and test the data pipeline deployment following the RADON approach. The dataset has a single CSV file containing around 32,000 Twitter tweets. From it, 100 CSV files were created, each containing 320 tweets. Those 100 CSV files are used to validate and test (performance/load testing) the data pipeline components.
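
    A minimal pandas sketch of the split described above (input and output file names are assumptions, not the dataset's actual paths):

    # Sketch: split one large tweet CSV into 100 files of 320 rows each.
    import pandas as pd
    
    tweets = pd.read_csv("tweets.csv")
    chunk_size = 320
    for i in range(100):
        chunk = tweets.iloc[i * chunk_size : (i + 1) * chunk_size]
        chunk.to_csv(f"tweets_part_{i:03d}.csv", index=False)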

  15. Comparing different k-fold cross-validation and the percentage of independent sampling data for testing (evaluating) the GCNMLP

    • datasetcatalog.nlm.nih.gov
    • figshare.com
    Updated Dec 14, 2022
    Cite
    Chien, C. -S.; Chen, Y. -H.; Shih, Y. -T.; Tsai, C. -S. (2022). Comparing different k-fold cross-validation and the percentage of independent sampling data for testing (evaluating) the GCNMLP. [Dataset]. https://datasetcatalog.nlm.nih.gov/dataset?q=0000315181
    Explore at:
    Dataset updated
    Dec 14, 2022
    Authors
    Chien, C. -S.; Chen, Y. -H.; Shih, Y. -T.; Tsai, C. -S.
    Description

    Comparing different k-fold cross-validation and the percentage of independent sampling data for testing (evaluating) the GCNMLP.
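
    As a generic illustration of the comparison this dataset reports (a stand-in classifier and toy data, not the authors' GCNMLP):

    # Sketch: compare k-fold CV estimates for several k against an
    # independent held-out test percentage.
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import KFold, cross_val_score, train_test_split
    
    X, y = make_classification(n_samples=300, random_state=0)
    X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    for k in (3, 5, 10):
        acc = cross_val_score(LogisticRegression(max_iter=1000), X_dev, y_dev,
                              cv=KFold(n_splits=k, shuffle=True, random_state=0)).mean()
        print(f"{k}-fold CV accuracy: {acc:.3f}")
    
    clf = LogisticRegression(max_iter=1000).fit(X_dev, y_dev)
    print(f"independent test accuracy: {clf.score(X_test, y_test):.3f}")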

  16. Data from: Wave Tank Testing Report for Controls Validation of a Heaving Point Absorber

    • catalog.data.gov
    • data.openei.org
    • +3more
    Updated Jan 20, 2025
    + more versions
    Cite
    Re Vision Consulting (2025). Wave Tank Testing Report for Controls Validation of a Heaving Point Absorber [Dataset]. https://catalog.data.gov/dataset/wave-tank-testing-report-for-controls-validation-of-a-heaving-point-absorber-0ff6c
    Explore at:
    Dataset updated
    Jan 20, 2025
    Dataset provided by
    Re Vision Consulting
    Description

    The core objective of this project is to improve the power capture of three different wave energy conversion (WEC) devices by more than 50% using an advanced control system, and to validate the attained improvements using wave tank and full-scale testing. In parallel, we will develop the wave prediction system that is required to enable effective control and test it at full scale. The purposes of this report are to: 1. Plan and document the 1/25th scale device testing at the wave-tank facility; 2. Document the test article, setup and methodology, sensor and instrumentation, mooring, electronics, wiring, and data flow and quality assurance; 3. Communicate the testing results between the associated members; 4. Facilitate reviews that will help to ensure all aspects (risk, safety, testing procedures, etc.); 5. Provide a systematic guide to setting up, executing and decommissioning the experiment.

  17. Brain Stroke Images

    • kaggle.com
    zip
    Updated Dec 14, 2023
    Cite
    Ayush Tibrewal (2023). Brain Stroke Images [Dataset]. https://www.kaggle.com/datasets/ayushtibrewal/brain-stroke-images/discussion
    Explore at:
    Available download formats: zip (69011379 bytes)
    Dataset updated
    Dec 14, 2023
    Authors
    Ayush Tibrewal
    Description

    The Data Explorer Version 1 dataset is a collection of images organized into two main categories: "stroke_cropped" and "stroke_noncropped." Each category is further subdivided into subsets for testing, training, and validation purposes.

    1. stroke_cropped:

      • CROPPED:
        • TEST_CROP
        • TRAIN_CROP
        • VAL_CROP
    2. stroke_noncropped:

      • NON_CROPPED:
        • TEST
        • TRAIN
        • VAL

    Description: The dataset primarily focuses on stroke-related images, categorized into cropped and non-cropped versions. In the "stroke_cropped" category, the images have undergone a cropping process, with subsets specifically designated for testing (TEST_CROP), training (TRAIN_CROP), and validation (VAL_CROP) purposes. On the other hand, the "stroke_noncropped" category contains images in their original, non-cropped form, with subsets similarly allocated for testing, training, and validation (TEST, TRAIN, VAL).

    The dataset size is approximately 73.4 MB. Researchers, developers, or practitioners interested in stroke-related image analysis and classification tasks may find this dataset useful for training and evaluating machine learning models. The inclusion of both cropped and non-cropped versions allows for a diverse range of experiments and applications, catering to different aspects of stroke-related image processing. It is recommended to review the specific subsets based on the task at hand, whether it be testing, training, or validation, to ensure proper use and interpretation of the dataset.

    The key difference between the "cropped" and "non-cropped" versions of the dataset lies in the preprocessing applied to the images.

    1. Cropped:

      • Images in the "CROPPED" category have undergone a cropping process, where a portion of the original image has been selected or extracted.
      • This cropping may be performed to focus on specific regions of interest within the image, excluding unnecessary or irrelevant background information.
      • Cropped images are often used to highlight and emphasize particular features, making it potentially easier for machine learning models to learn and classify relevant patterns.
    2. Non-Cropped:

      • Images in the "NON_CROPPED" category are presented in their original form without any cropping applied.
      • These images contain the entire scene or object captured by the original image, providing a broader context for analysis.
      • Non-cropped images might contain more background information, and the relevant features for analysis are not isolated or emphasized as they are in the cropped versions.

    Use Cases: - The choice between cropped and non-cropped images depends on the specific goals of a machine learning task. If the objective is to focus on detailed features within a limited region, cropped images might be more suitable. - On the other hand, if a comprehensive understanding of the entire scene is crucial, non-cropped images may be preferred.

    Researchers and practitioners may experiment with both versions based on their specific image analysis objectives and the requirements of their machine learning models. The inclusion of both cropped and non-cropped datasets provides flexibility for different use cases and research scenarios.
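
    Following the Keras loading pattern used in entry 1, a sketch for reading these subsets (the root path and image size are assumptions; folder names follow the structure listed above):

    # Sketch: load the cropped train/validation/test subsets with Keras.
    import tensorflow as tf
    
    root = "stroke_cropped/CROPPED"  # hypothetical local path
    train_ds = tf.keras.utils.image_dataset_from_directory(
        f"{root}/TRAIN_CROP", image_size=(224, 224), batch_size=32)
    val_ds = tf.keras.utils.image_dataset_from_directory(
        f"{root}/VAL_CROP", image_size=(224, 224), batch_size=32)
    test_ds = tf.keras.utils.image_dataset_from_directory(
        f"{root}/TEST_CROP", image_size=(224, 224), batch_size=32)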

  18. Testing dataset for the "long-bone-diaphyseal-CSG-Toolkit"

    • data.niaid.nih.gov
    Updated Jan 21, 2020
    Cite
    Bertsatos Andreas; Chovalopoulou Maria-Eleni (2020). Testing dataset for the "long-bone-diaphyseal-CSG-Toolkit" [Dataset]. https://data.niaid.nih.gov/resources?id=zenodo_1466961
    Explore at:
    Dataset updated
    Jan 21, 2020
    Dataset provided by
    Department of Animal and Human Physiology, National and Kapodistrian University of Athens
    Authors
    Bertsatos Andreas; Chovalopoulou Maria-Eleni
    License

    Attribution-NonCommercial-NoDerivs 4.0 (CC BY-NC-ND 4.0): https://creativecommons.org/licenses/by-nc-nd/4.0/
    License information was derived automatically

    Description

    The present dataset was used for the validation study of correct operation of the "long-bone-diaphyseal-CSG-Toolkit". It consists of three 3D mesh bone models (a humerus, a femur, and a tibia, all part of the Athens modern reference skeletal collection) used for comparison with alternative methods for calculating CSG properties of long bones, and one 3D mesh ground model (with known geometric properties) used as a gold-standard reference.

    Additionally, the dataset includes all the results (stored in the respective csv files) from analyzing each of these models with the GNU Octave CSG Toolkit v1.0.1. The present dataset acts both as supplementary material to the validation study and as a sample dataset for user testing of the operation of the GNU Octave CSG Toolkit.

  19. Automated Cryptographic Validation Test System Generators and Validators

    • catalog.data.gov
    • data.nist.gov
    • +1more
    Updated Jul 29, 2022
    Cite
    National Institute of Standards and Technology (2022). Automated Cryptographic Validation Test System Generators and Validators [Dataset]. https://catalog.data.gov/dataset/automated-cryptographic-validation-test-system-generators-and-validators
    Explore at:
    Dataset updated
    Jul 29, 2022
    Dataset provided by
    National Institute of Standards and Technology (http://www.nist.gov/)
    Description

    This is a program that takes in a description of a cryptographic algorithm implementation's capabilities, and generates test vectors to ensure the implementation conforms to the standard. After generating the test vectors, the program also validates the correctness of the responses from the user.

  20. FAIR Dataset for Disease Prediction in Healthcare Applications

    • test.researchdata.tuwien.ac.at
    bin, csv, json, png
    Updated Apr 14, 2025
    Cite
    Sufyan Yousaf (2025). FAIR Dataset for Disease Prediction in Healthcare Applications [Dataset]. http://doi.org/10.70124/5n77a-dnf02
    Explore at:
    Available download formats: csv, json, bin, png
    Dataset updated
    Apr 14, 2025
    Dataset provided by
    TU Wien
    Authors
    Sufyan Yousaf
    License

    Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Dataset Description

    Context and Methodology

    • Research Domain/Project:
      This dataset was created for a machine learning experiment aimed at developing a classification model to predict outcomes based on a set of features. The primary research domain is disease prediction in patients. The dataset was used in the context of training, validating, and testing.

    • Purpose of the Dataset:
      The purpose of this dataset is to provide training, validation, and testing data for the development of machine learning models. It includes labeled examples that help train classifiers to recognize patterns in the data and make predictions.

    • Dataset Creation:
      Data preprocessing steps involved cleaning, normalization, and splitting the data into training, validation, and test sets. The data was carefully curated to ensure its quality and relevance to the problem at hand. For any missing values or outliers, appropriate handling techniques were applied (e.g., imputation, removal, etc.).

    Technical Details

    • Structure of the Dataset:
      The dataset consists of several files organized into folders by data type:

      • Training Data: Contains the training dataset used to train the machine learning model.

      • Validation Data: Used for hyperparameter tuning and model selection.

      • Test Data: Reserved for final model evaluation.

      Each folder contains files with consistent naming conventions for easy navigation, such as train_data.csv, validation_data.csv, and test_data.csv. Each file follows a tabular format with columns representing features and rows representing individual data points (see the loading sketch after this list).

    • Software Requirements:
      To open and work with this dataset, you need an environment such as VS Code or Jupyter, along with:

      • Python (with libraries such as pandas, numpy, scikit-learn, matplotlib, etc.)
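
    As referenced above, a minimal loading sketch using the stated naming conventions (the folder layout and the "label" column are assumptions):

    # Sketch: load the three splits named in the dataset description.
    import pandas as pd
    
    train = pd.read_csv("train_data.csv")
    val = pd.read_csv("validation_data.csv")
    test = pd.read_csv("test_data.csv")
    
    # Hypothetical target column; train on train, tune on val, report once on test.
    X_train, y_train = train.drop(columns=["label"]), train["label"]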

    Further Details

    • Reusability:
      Users of this dataset should be aware that it is designed for machine learning experiments involving classification tasks. The dataset is already split into training, validation, and test subsets. Any model trained with this dataset should be evaluated using the test set to ensure proper validation.

    • Limitations:
      The dataset may not cover all edge cases, and it might have biases depending on the selection of data sources. It's important to consider these limitations when generalizing model results to real-world applications.
