7 datasets found
  1. FacialRecognition

    • kaggle.com
    zip
    Updated Dec 1, 2016
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    TheNicelander (2016). FacialRecognition [Dataset]. https://www.kaggle.com/petein/facialrecognition
    Explore at:
    zip(121674455 bytes)Available download formats
    Dataset updated
    Dec 1, 2016
    Authors
    TheNicelander
    License

    http://opendatacommons.org/licenses/dbcl/1.0/http://opendatacommons.org/licenses/dbcl/1.0/

    Description

    #https://www.kaggle.com/c/facial-keypoints-detection/details/getting-started-with-r #################################

    ###Variables for downloaded files data.dir <- ' ' train.file <- paste0(data.dir, 'training.csv') test.file <- paste0(data.dir, 'test.csv') #################################

    ###Load csv -- creates a data.frame matrix where each column can have a different type. d.train <- read.csv(train.file, stringsAsFactors = F) d.test <- read.csv(test.file, stringsAsFactors = F)

    ###In training.csv, we have 7049 rows, each one with 31 columns. ###The first 30 columns are keypoint locations, which R correctly identified as numbers. ###The last one is a string representation of the image, identified as a string.

    ###To look at samples of the data, uncomment this line:

    head(d.train)

    ###Let's save the last column as another variable, and remove it from d.train: ###d.train is our dataframe, and we want the column called Image. ###Assigning NULL to a column removes it from the dataframe

    im.train <- d.train$Image d.train$Image <- NULL #removes 'image' from the dataframe

    im.test <- d.test$Image d.test$Image <- NULL #removes 'image' from the dataframe

    ################################# #The image is represented as a series of numbers, stored as a string #Convert these strings to integers by splitting them and converting the result to integer

    #strsplit splits the string #unlist simplifies its output to a vector of strings #as.integer converts it to a vector of integers. as.integer(unlist(strsplit(im.train[1], " "))) as.integer(unlist(strsplit(im.test[1], " ")))

    ###Install and activate appropriate libraries ###The tutorial is meant for Linux and OSx, where they use a different library, so: ###Replace all instances of %dopar% with %do%.

    install.packages('foreach')

    library("foreach", lib.loc="~/R/win-library/3.3")

    ###implement parallelization im.train <- foreach(im = im.train, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } im.test <- foreach(im = im.test, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } #The foreach loop will evaluate the inner command for each row in im.train, and combine the results with rbind (combine by rows). #%do% instructs R to do all evaluations sequentially (%dopar%, which this script replaces with %do%, is the operator that runs them in parallel). #im.train is now a matrix with 7049 rows (one for each image) and 9216 columns (one for each pixel):

    ###Save all four variables in data.Rd file ###Can reload them at anytime with load('data.Rd')

    save(d.train, im.train, d.test, im.test, file='data.Rd')

    load('data.Rd')

    #each image is a vector of 96*96 pixels (96*96 = 9216). #convert these 9216 integers into a 96x96 matrix: im <- matrix(data=rev(im.train[1,]), nrow=96, ncol=96)

    #im.train[1,] returns the first row of im.train, which corresponds to the first training image. #rev reverses the resulting vector to match the interpretation of R's image function #(which expects the origin to be in the lower left corner).

    #To visualize the image we use R's image function: image(1:96, 1:96, im, col=gray((0:255)/255))

    #Let’s color the coordinates for the eyes and nose points(96-d.train$nose_tip_x[1], 96-d.train$nose_tip_y[1], col="red") points(96-d.train$left_eye_center_x[1], 96-d.train$left_eye_center_y[1], col="blue") points(96-d.train$right_eye_center_x[1], 96-d.train$right_eye_center_y[1], col="green")

    #Another good check is to see how variable is our data. #For example, where are the centers of each nose in the 7049 images? (this takes a while to run): for(i in 1:nrow(d.train)) { points(96-d.train$nose_tip_x[i], 96-d.train$nose_tip_y[i], col="red") }

    #there are quite a few outliers -- they could be labeling errors. Looking at one extreme example we get this: #In this case there's no labeling error, but this shows that not all faces are centralized idx <- which.max(d.train$nose_tip_x) im <- matrix(data=rev(im.train[idx,]), nrow=96, ncol=96) image(1:96, 1:96, im, col=gray((0:255)/255)) points(96-d.train$nose_tip_x[idx], 96-d.train$nose_tip_y[idx], col="red")

    #One of the simplest things to try is to compute the mean of the coordinates of each keypoint in the training set and use that as a prediction for all images colMeans(d.train, na.rm=T)

    #To build a submission file we need to apply these computed coordinates to the test instances: p <- matrix(data=colMeans(d.train, na.rm=T), nrow=nrow(d.test), ncol=ncol(d.train), byrow=T) colnames(p) <- names(d.train) predictions <- data.frame(ImageId = 1:nrow(d.test), p) head(predictions)

    #The expected submission format has one keypoint per row, but we can easily get that with the help of the reshape2 library:

    install.packages('reshape2')

    library(...

  2. m

    R codes and dataset for Visualisation of Diachronic Constructional Change...

    • bridges.monash.edu
    • researchdata.edu.au
    zip
    Updated May 30, 2023
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    Gede Primahadi Wijaya Rajeg (2023). R codes and dataset for Visualisation of Diachronic Constructional Change using Motion Chart [Dataset]. http://doi.org/10.26180/5c844c7a81768
    Explore at:
    zipAvailable download formats
    Dataset updated
    May 30, 2023
    Dataset provided by
    Monash University
    Authors
    Gede Primahadi Wijaya Rajeg
    License

    Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)https://creativecommons.org/licenses/by-nc-sa/4.0/
    License information was derived automatically

    Description

    Publication: Primahadi Wijaya R., Gede. 2014. Visualisation of diachronic constructional change using Motion Chart. In Zane Goebel, J. Herudjati Purwoko, Suharno, M. Suryadi & Yusuf Al Aried (eds.). Proceedings: International Seminar on Language Maintenance and Shift IV (LAMAS IV), 267-270. Semarang: Universitas Diponegoro. doi: https://doi.org/10.4225/03/58f5c23dd8387 Description of R codes and data files in the repository: This repository is imported from its GitHub repo. Versioning of this figshare repository is associated with the GitHub repo's Release. So, check the Releases page for updates (the next version is to include the unified version of the codes in the first release with the tidyverse). The raw input data consists of two files (i.e. will_INF.txt and go_INF.txt). They represent the co-occurrence frequency of top-200 infinitival collocates for will and be going to respectively across the twenty decades of Corpus of Historical American English (from the 1810s to the 2000s). These two input files are used in the R code file 1-script-create-input-data-raw.r. The codes preprocess and combine the two files into a long format data frame consisting of the following columns: (i) decade, (ii) coll (for "collocate"), (iii) BE going to (for frequency of the collocates with be going to) and (iv) will (for frequency of the collocates with will); it is available in the input_data_raw.txt. Then, the script 2-script-create-motion-chart-input-data.R processes the input_data_raw.txt for normalising the co-occurrence frequency of the collocates per million words (the COHA size and normalising base frequency are available in coha_size.txt). 
The output from the second script is input_data_futurate.txt. Next, input_data_futurate.txt contains the relevant input data for generating (i) the static motion chart as an image plot in the publication (using the script 3-script-create-motion-chart-plot.R), and (ii) the dynamic motion chart (using the script 4-script-motion-chart-dynamic.R). The repository adopts the project-oriented workflow in RStudio; double-click on the Future Constructions.Rproj file to open an RStudio session whose working directory is associated with the contents of this repository.

  3. Scripts and data from Beyond Perfect Synchrony: Shared Interpersonal...

    • rs.figshare.com
    zip
    Updated Mar 18, 2025
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    Dhwani P. Sadaphal; Christian Reinhard Blum; Peter Keller; Tecumseh Fitch (2025). Scripts and data from Beyond Perfect Synchrony: Shared Interpersonal Rhythmic Timing Enhances Self-Other Merging Judgements [Dataset]. http://doi.org/10.6084/m9.figshare.28564235.v1
    Explore at:
    zipAvailable download formats
    Dataset updated
    Mar 18, 2025
    Dataset provided by
    Royal Societyhttp://royalsociety.org/
    Authors
    Dhwani P. Sadaphal; Christian Reinhard Blum; Peter Keller; Tecumseh Fitch
    License

    Attribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Analysis and plotting R scripts used in both experiments with R workspaces complete with dataframes and results.

  4. Rcode – Custom code written the R programming language that will translate...

    • plos.figshare.com
    txt
    Updated Nov 19, 2025
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    Anthony Nearman; Alriana Buller-Jarrett; Dawn Boncristiani; Eugene Ryabov; Yanping Chen; Jay D. Evans (2025). Rcode – Custom code written the R programming language that will translate an open reading frame for an existing sequence, then compare it to a data frame of nucleotide polymorphisms at specific locations, and retranslate the amino acid changes into a new data frame. [Dataset]. http://doi.org/10.1371/journal.pone.0337191.s009
    Explore at:
    txtAvailable download formats
    Dataset updated
    Nov 19, 2025
    Dataset provided by
    PLOShttp://plos.org/
    Authors
    Anthony Nearman; Alriana Buller-Jarrett; Dawn Boncristiani; Eugene Ryabov; Yanping Chen; Jay D. Evans
    License

    Attribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Rcode – Custom code written the R programming language that will translate an open reading frame for an existing sequence, then compare it to a data frame of nucleotide polymorphisms at specific locations, and retranslate the amino acid changes into a new data frame.

  5. Time Series Forecasting Using Prophet in R

    • kaggle.com
    zip
    Updated Jul 25, 2023
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    vikram amin (2023). Time Series Forecasting Using Prophet in R [Dataset]. https://www.kaggle.com/datasets/vikramamin/time-series-forecasting-using-prophet-in-r
    Explore at:
    zip(9000 bytes)Available download formats
    Dataset updated
    Jul 25, 2023
    Authors
    vikram amin
    License

    https://creativecommons.org/publicdomain/zero/1.0/https://creativecommons.org/publicdomain/zero/1.0/

    Description
    • Main objective : To forecast the page visits of a website
    • Tool : Time Series Forecasting using Prophet in R.
    • Steps:
    • Read the data
    • Data Cleaning: Checking data types, date formats and missing data https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F56d7b1edf4f51157804e81b02c032e4d%2FPicture1.png?generation=1690271521103777&alt=media
    • Run libraries (dplyr, ggplot2, tidyverse, lubridate, prophet, forecast)
    • Change the Date column from character vector to date and change data format using lubridate package
    • Rename the column "Date" to "ds" and "Visits" to "y".
    • Treat "Christmas" and "Black.Friday" as holiday events. As the data ranges from 2016 to 2020, there will be 5 Christmas and 5 Black Friday days.
    • We will look at the impact of Christmas 3 days prior and 3 days later from Christmas date on "Visits" and 3 days prior and 1 day later for Black Friday
    • We create two data frames called Christmas and Black.Friday and merge the two into a data frame called "holidays". https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2Fd07b366be2050fefe6a62563b6abac0c%2FPicture2.png?generation=1690272066356516&alt=media
    • We create train and test data. In both train and test data, we select only 3 variables, namely ds, y and Easter. The train data contains rows with ds before 2020-12-01, and the test data contains rows with ds on or after 2020-12-01 (31 days).
    • Train Data
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F8f3f58fe40b29b276bb7103cb1dfdde1%2FPicture3.png?generation=1690272272038405&alt=media
    • Test Data
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2Fb4362117f46aeb210dad23f07d3ecb39%2FPicture4.png?generation=1690272400355824&alt=media
    • Use the prophet model, which includes multiple parameters. We are going with the default parameters. Thereafter, we add the external regressor "Easter".
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F7325be63d887372cc5764ddf29a94310%2FPicture5.png?generation=1690272892963939&alt=media
    • We create the future data frame for forecasting and name the data frame "future". It will include "m" and 31 days of the test data. We then predict this future data frame and create a new data frame called "forecast".
    • Forecast data frame consists of 1827 rows and 34 variables. This shows the external Regressor (Easter) value is 0 through the entire time period. This shows that "Easter" has no impact or effect on "Visits".
    • yhat stands for the predicted value (predicted visits).
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2Fae5c9414d1b1bbb2670b372a326970a5%2FPicture6.png?generation=1690273558489681&alt=media
    • We try to understand the impact of Holiday events "Christmas" and "Black.Friday"
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F5a36cc5308f9e46f0b63fa8e37c4b932%2FPicture7.png?generation=1690273814760538&alt=media
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F8cc3dd0581db1e8b9d542d9a524abd39%2FPicture8.png?generation=1690273879506571&alt=media
    • We plot the forecast.
    • plot(m,forecast) https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2Fa7968ff05abdd5b4e789f3723b41c4ed%2FPicture9.png?generation=1690274020880594&alt=media
    • blue is predicted value(yhat) and black is actual value(y) and blue shaded regions are the yhat_upper and yhat_lower values
    • prophet_plot_components(m,forecast) https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F52408afb8c71118ef6729420085875e8%2FPicture10.png?generation=1690274184325240&alt=media
    • Trend indicates that the page visits remained constant from Jan'16 to Mid'17 and thereafter there was an upswing from Mid'19 to End of 2020
    • From Holidays, we can make out that Christmas had a negative effect on page visits whereas Black Friday had a positive effect on page visits
    • Weekly seasonality indicates that page visits tend to remain the highest from Monday to Thursday and starts going down thereafter
    • Yearly seasonality indicates that page visits are the highest in Apr and then start going down thereafter, with
    • Oct reaching the bottom point
    • External regressor "Easter" has no impact on page visits
    • plot(m,forecast) + add_changepoints_to_plot(m)
    • https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F10868729%2F1253a0e381ae04d3156a4b098dafb2ca%2FPicture11.png?generation=1690274373570449&alt=media
    • Trend which is indicated by the red line starts moving upwards from Mid 2019 to 2020 onwards
    • We check for acc...
  6. f

    Greiser et al. (2023) Higher soil moisture increases microclimate...

    • su.figshare.com
    • demo.researchdata.se
    • +2more
    txt
    Updated Dec 4, 2023
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    Caroline Greiser; Martin Kopecký; Martin Macek; Lucia Hederová; Giulia Vico; Jan Wild (2023). Greiser et al. (2023) Higher soil moisture increases microclimate temperature buffering in temperate broadleaf forests - Data and Code [Dataset]. http://doi.org/10.17045/sthlmuni.24247090.v1
    Explore at:
    txtAvailable download formats
    Dataset updated
    Dec 4, 2023
    Dataset provided by
    Stockholm University
    Authors
    Caroline Greiser; Martin Kopecký; Martin Macek; Lucia Hederová; Giulia Vico; Jan Wild
    License

    Attribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
    License information was derived automatically

    Description

    Abstract of the article to which the data and code belong: Forest canopies can buffer the understory against temperature extremes often creating cooler microclimates during warm summer days compared to temperatures outside the forest. The buffering of maximum temperatures in the understory results from a combination of canopy shading and air cooling through soil water evaporation and plant transpiration. Therefore, buffering capacity of forests depends on canopy cover and soil moisture content, which are increasingly affected by more frequent and severe canopy disturbances and soil droughts. The extent to which this buffering will be maintained in future conditions is unclear due to the lack of understanding about the relationship between soil moisture and air temperature buffering in interaction with canopy cover and topographic settings. We explored how soil moisture variability affects temperature offsets between outside and inside the forest on a daily basis, using temperature and soil moisture data from 54 sites in temperate broadleaf forests in Central Europe over four climatically different summer seasons. Daily maximum temperatures in forest understories were on average 2 °C cooler than outside temperatures. The buffering of understory temperatures was more effective when soil moisture was higher, and the offsets were more sensitive to soil moisture on sites with drier soils and on sun-exposed slopes with high topographic heat load. Based on these results, the soil-water limitation to forest temperature buffering will become more prevalent under future warmer conditions and will likely lead to changes in understory communities. 
Thus, our results highlight the urgent need to include soil moisture in models and predictions of forest microclimate, understory biodiversity and tree regeneration, to provide a more precise estimate of the effects of climate change. List of files: 02_model_offset_from_soilmoist_rev.r => R-script for the statistical analysis; model_data_complete_figshare.csv => cleaned and complete data for the statistical analysis; model_data_4thday_figshare.csv => cleaned and "thinned" data for the statistical analysis; README.txt => metadata describing columns in the dataframes and the environment of the R-script

  7. Google Data Analytics Case Study Cyclistic

    • kaggle.com
    zip
    Updated Sep 27, 2022
    + more versions
    Share
    FacebookFacebook
    TwitterTwitter
    Email
    Click to copy link
    Link copied
    Close
    Cite
    Udayakumar19 (2022). Google Data Analytics Case Study Cyclistic [Dataset]. https://www.kaggle.com/datasets/udayakumar19/google-data-analytics-case-study-cyclistic/suggestions
    Explore at:
    zip(1299 bytes)Available download formats
    Dataset updated
    Sep 27, 2022
    Authors
    Udayakumar19
    Description

    Introduction

    Welcome to the Cyclistic bike-share analysis case study! In this case study, you will perform many real-world tasks of a junior data analyst. You will work for a fictional company, Cyclistic, and meet different characters and team members. In order to answer the key business questions, you will follow the steps of the data analysis process: ask, prepare, process, analyze, share, and act. Along the way, the Case Study Roadmap tables — including guiding questions and key tasks — will help you stay on the right path.

    Scenario

    You are a junior data analyst working in the marketing analyst team at Cyclistic, a bike-share company in Chicago. The director of marketing believes the company’s future success depends on maximizing the number of annual memberships. Therefore, your team wants to understand how casual riders and annual members use Cyclistic bikes differently. From these insights, your team will design a new marketing strategy to convert casual riders into annual members. But first, Cyclistic executives must approve your recommendations, so they must be backed up with compelling data insights and professional data visualizations.

    Ask

    How do annual members and casual riders use Cyclistic bikes differently?

    Guiding Question:

    What is the problem you are trying to solve?
      How do annual members and casual riders use Cyclistic bikes differently?
    How can your insights drive business decisions?
      The insight will help the marketing team to make a strategy for casual riders
    

    Prepare

    Guiding Question:

    Where is your data located?
      Data located in Cyclistic organization data.
    
    How is data organized?
      The datasets are in csv format, one file per month, for financial year 22.
    
    Are there issues with bias or credibility in this data? Does your data ROCCC? 
      The data is good and it is ROCCC, because it was collected by the Cyclistic organization.
    
    How are you addressing licensing, privacy, security, and accessibility?
      The company has their own license over the dataset. Dataset does not have any personal information about the riders.
    
    How did you verify the data’s integrity?
      All the files have consistent columns and each column has the correct type of data.
    
    How does it help you answer your questions?
      Insights are always hidden in the data. We have to interpret the data to find the insights.
    
    Are there any problems with the data?
      Yes, starting station names, ending station names have null values.
    

    Process

    Guiding Question:

    What tools are you choosing and why?
      I used RStudio for cleaning and transforming the data for the analysis phase because of the large dataset and to gain experience in the language.
    
    Have you ensured the data’s integrity?
     Yes, the data is consistent throughout the columns.
    
    What steps have you taken to ensure that your data is clean?
      First, duplicates and null values are removed; then new columns are added for analysis.
    
    How can you verify that your data is clean and ready to analyze? 
     Make sure the column names are consistent throughout all data sets by using the “bind row” function.
    
    Make sure column data types are consistent throughout all the datasets by using the “compare_df_col” from the “janitor” package.
    Combine all the datasets into a single data frame to keep them consistent throughout the analysis.
    Removed the columns start_lat, start_lng, end_lat, end_lng from the dataframe because those columns are not required for analysis.
    Create new columns day, date, month, year, from the started_at column this will provide additional opportunities to aggregate the data
    Create the “ride_length” column from the started_at and ended_at column to find the average duration of the ride by the riders.
    Removed the null rows from the dataset by using the “na.omit function”
    Have you documented your cleaning process so you can review and share those results? 
      Yes, the cleaning process is documented clearly.
    

    Analyze Phase:

    Guiding Questions:

    How should you organize your data to perform analysis on it? The data has been organized in one single dataframe by using the read csv function in R. Has your data been properly formatted? Yes, all the columns have their correct data type.

    What surprises did you discover in the data?
      Casual members' ride duration is higher than the annual members'
      Casual members use docked bikes more widely than the annual members
    What trends or relationships did you find in the data?
      Annual members use the bikes mainly for commuting purposes
      Casual members prefer the docked bikes
      Annual members prefer the electric or classic bikes
    How will these insights help answer your business questions?
      These insights help to build a profile for members
    

    Share

    Guiding Questions:

    Were you able to answer the question of how ...
    
  8. Not seeing a result you expected?
    Learn how you can add new datasets to our index.

Share
FacebookFacebook
TwitterTwitter
Email
Click to copy link
Link copied
Close
Cite
TheNicelander (2016). FacialRecognition [Dataset]. https://www.kaggle.com/petein/facialrecognition
Organization logo

FacialRecognition

Test environment for FacialRecognition competition

Explore at:
zip(121674455 bytes)Available download formats
Dataset updated
Dec 1, 2016
Authors
TheNicelander
License

http://opendatacommons.org/licenses/dbcl/1.0/http://opendatacommons.org/licenses/dbcl/1.0/

Description

#https://www.kaggle.com/c/facial-keypoints-detection/details/getting-started-with-r #################################

###Variables for downloaded files data.dir <- ' ' train.file <- paste0(data.dir, 'training.csv') test.file <- paste0(data.dir, 'test.csv') #################################

###Load csv -- creates a data.frame matrix where each column can have a different type. d.train <- read.csv(train.file, stringsAsFactors = F) d.test <- read.csv(test.file, stringsAsFactors = F)

###In training.csv, we have 7049 rows, each one with 31 columns. ###The first 30 columns are keypoint locations, which R correctly identified as numbers. ###The last one is a string representation of the image, identified as a string.

###To look at samples of the data, uncomment this line:

head(d.train)

###Let's save the last column as another variable, and remove it from d.train: ###d.train is our dataframe, and we want the column called Image. ###Assigning NULL to a column removes it from the dataframe

im.train <- d.train$Image d.train$Image <- NULL #removes 'image' from the dataframe

im.test <- d.test$Image d.test$Image <- NULL #removes 'image' from the dataframe

################################# #The image is represented as a series of numbers, stored as a string #Convert these strings to integers by splitting them and converting the result to integer

#strsplit splits the string #unlist simplifies its output to a vector of strings #as.integer converts it to a vector of integers. as.integer(unlist(strsplit(im.train[1], " "))) as.integer(unlist(strsplit(im.test[1], " ")))

###Install and activate appropriate libraries ###The tutorial is meant for Linux and OSx, where they use a different library, so: ###Replace all instances of %dopar% with %do%.

install.packages('foreach')

library("foreach", lib.loc="~/R/win-library/3.3")

###implement parallelization im.train <- foreach(im = im.train, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } im.test <- foreach(im = im.test, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } #The foreach loop will evaluate the inner command for each row in im.train, and combine the results with rbind (combine by rows). #%do% instructs R to do all evaluations sequentially (%dopar%, which this script replaces with %do%, is the operator that runs them in parallel). #im.train is now a matrix with 7049 rows (one for each image) and 9216 columns (one for each pixel):

###Save all four variables in data.Rd file ###Can reload them at anytime with load('data.Rd')

save(d.train, im.train, d.test, im.test, file='data.Rd')

load('data.Rd')

#each image is a vector of 96*96 pixels (96*96 = 9216). #convert these 9216 integers into a 96x96 matrix: im <- matrix(data=rev(im.train[1,]), nrow=96, ncol=96)

#im.train[1,] returns the first row of im.train, which corresponds to the first training image. #rev reverses the resulting vector to match the interpretation of R's image function #(which expects the origin to be in the lower left corner).

#To visualize the image we use R's image function: image(1:96, 1:96, im, col=gray((0:255)/255))

#Let’s color the coordinates for the eyes and nose points(96-d.train$nose_tip_x[1], 96-d.train$nose_tip_y[1], col="red") points(96-d.train$left_eye_center_x[1], 96-d.train$left_eye_center_y[1], col="blue") points(96-d.train$right_eye_center_x[1], 96-d.train$right_eye_center_y[1], col="green")

#Another good check is to see how variable is our data. #For example, where are the centers of each nose in the 7049 images? (this takes a while to run): for(i in 1:nrow(d.train)) { points(96-d.train$nose_tip_x[i], 96-d.train$nose_tip_y[i], col="red") }

#there are quite a few outliers -- they could be labeling errors. Looking at one extreme example we get this: #In this case there's no labeling error, but this shows that not all faces are centralized idx <- which.max(d.train$nose_tip_x) im <- matrix(data=rev(im.train[idx,]), nrow=96, ncol=96) image(1:96, 1:96, im, col=gray((0:255)/255)) points(96-d.train$nose_tip_x[idx], 96-d.train$nose_tip_y[idx], col="red")

#One of the simplest things to try is to compute the mean of the coordinates of each keypoint in the training set and use that as a prediction for all images colMeans(d.train, na.rm=T)

#To build a submission file we need to apply these computed coordinates to the test instances: p <- matrix(data=colMeans(d.train, na.rm=T), nrow=nrow(d.test), ncol=ncol(d.train), byrow=T) colnames(p) <- names(d.train) predictions <- data.frame(ImageId = 1:nrow(d.test), p) head(predictions)

#The expected submission format has one keypoint per row, but we can easily get that with the help of the reshape2 library:

install.packages('reshape2')

library(...

Search
Clear search
Close search
Google apps
Main menu