Facebook
Twitterhttp://opendatacommons.org/licenses/dbcl/1.0/http://opendatacommons.org/licenses/dbcl/1.0/
#https://www.kaggle.com/c/facial-keypoints-detection/details/getting-started-with-r #################################
###Variables for downloaded files data.dir <- ' ' train.file <- paste0(data.dir, 'training.csv') test.file <- paste0(data.dir, 'test.csv') #################################
###Load csv -- creates a data.frame matrix where each column can have a different type. d.train <- read.csv(train.file, stringsAsFactors = F) d.test <- read.csv(test.file, stringsAsFactors = F)
###In training.csv, we have 7049 rows, each one with 31 columns. ###The first 30 columns are keypoint locations, which R correctly identified as numbers. ###The last one is a string representation of the image, identified as a string.
###To look at samples of the data, uncomment this line:
###Let's save the first column as another variable, and remove it from d.train: ###d.train is our dataframe, and we want the column called Image. ###Assigning NULL to a column removes it from the dataframe
im.train <- d.train$Image d.train$Image <- NULL #removes 'image' from the dataframe
im.test <- d.test$Image d.test$Image <- NULL #removes 'image' from the dataframe
################################# #The image is represented as a series of numbers, stored as a string #Convert these strings to integers by splitting them and converting the result to integer
#strsplit splits the string #unlist simplifies its output to a vector of strings #as.integer converts it to a vector of integers. as.integer(unlist(strsplit(im.train[1], " "))) as.integer(unlist(strsplit(im.test[1], " ")))
###Install and activate appropriate libraries ###The tutorial is meant for Linux and OSx, where they use a different library, so: ###Replace all instances of %dopar% with %do%.
library("foreach", lib.loc="~/R/win-library/3.3")
###implement parallelization im.train <- foreach(im = im.train, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } im.test <- foreach(im = im.test, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } #The foreach loop will evaluate the inner command for each row in im.train, and combine the results with rbind (combine by rows). #%do% instructs R to run the evaluations sequentially (%dopar%, used in the original tutorial, is the parallel version). #im.train is now a matrix with 7049 rows (one for each image) and 9216 columns (one for each pixel):
###Save all four variables in data.Rd file ###Can reload them at anytime with load('data.Rd')
#each image is a vector of 96*96 pixels (96*96 = 9216). #convert these 9216 integers into a 96x96 matrix: im <- matrix(data=rev(im.train[1,]), nrow=96, ncol=96)
#im.train[1,] returns the first row of im.train, which corresponds to the first training image. #rev reverse the resulting vector to match the interpretation of R's image function #(which expects the origin to be in the lower left corner).
#To visualize the image we use R's image function: image(1:96, 1:96, im, col=gray((0:255)/255))
#Let’s color the coordinates for the eyes and nose points(96-d.train$nose_tip_x[1], 96-d.train$nose_tip_y[1], col="red") points(96-d.train$left_eye_center_x[1], 96-d.train$left_eye_center_y[1], col="blue") points(96-d.train$right_eye_center_x[1], 96-d.train$right_eye_center_y[1], col="green")
#Another good check is to see how variable is our data. #For example, where are the centers of each nose in the 7049 images? (this takes a while to run): for(i in 1:nrow(d.train)) { points(96-d.train$nose_tip_x[i], 96-d.train$nose_tip_y[i], col="red") }
#there are quite a few outliers -- they could be labeling errors. Looking at one extreme example we get this: #In this case there's no labeling error, but this shows that not all faces are centralized idx <- which.max(d.train$nose_tip_x) im <- matrix(data=rev(im.train[idx,]), nrow=96, ncol=96) image(1:96, 1:96, im, col=gray((0:255)/255)) points(96-d.train$nose_tip_x[idx], 96-d.train$nose_tip_y[idx], col="red")
#One of the simplest things to try is to compute the mean of the coordinates of each keypoint in the training set and use that as a prediction for all images colMeans(d.train, na.rm=T)
#To build a submission file we need to apply these computed coordinates to the test instances: p <- matrix(data=colMeans(d.train, na.rm=T), nrow=nrow(d.test), ncol=ncol(d.train), byrow=T) colnames(p) <- names(d.train) predictions <- data.frame(ImageId = 1:nrow(d.test), p) head(predictions)
#The expected submission format has one keypoint per row, but we can easily get that with the help of the reshape2 library:
library(...
Facebook
TwitterVersion 5 release notes:
Removes support for SPSS and Excel data. Changes the crimes that are stored in each file. There are more files now with fewer crimes per file. The files and their included crimes have been updated below.
Adds in agencies that report 0 months of the year. Adds a column that indicates the number of months reported. This is generated by summing up the number of unique months an agency reports data for. Note that this indicates the number of months an agency reported arrests for ANY crime. They may not necessarily report every crime every month. Agencies that did not report a crime will have a value of NA for every arrest column for that crime. Removes data on runaways.
Version 4 release notes:
Changes column names from "poss_coke" and "sale_coke" to "poss_heroin_coke" and "sale_heroin_coke" to clearly indicate that these columns include the sale of heroin as well as similar opiates such as morphine, codeine, and opium. Also changes column names for the narcotic columns to indicate that they are only for synthetic narcotics.
Version 3 release notes:
Add data for 2016. Order rows by year (descending) and ORI.
Version 2 release notes:
Fix bug where Philadelphia Police Department had incorrect FIPS county code.
The Arrests by Age, Sex, and Race data is an FBI data set that is part of the annual Uniform Crime Reporting (UCR) Program data. This data contains highly granular data on the number of people arrested for a variety of crimes (see below for a full list of included crimes). The data sets here combine data from the years 1980-2015 into a single file. These files are quite large and may take some time to load.
All the data was downloaded from NACJD as ASCII+SPSS Setup files and read into R using the package asciiSetupReader. All work to clean the data and save it in various file formats was also done in R. For the R code used to clean this data, see here. https://github.com/jacobkap/crime_data. If you have any questions, comments, or suggestions please contact me at jkkaplan6@gmail.com.
I did not make any changes to the data other than the following. When an arrest column has a value of "None/not reported", I change that value to zero. This makes the (possibly incorrect) assumption that these values represent zero crimes reported. The original data does not have a value when the agency reports zero arrests other than "None/not reported." In other words, this data does not differentiate between real zeros and missing values. Some agencies also incorrectly report the following numbers of arrests which I change to NA: 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 99999, 99998.
To reduce file size and make the data more manageable, all of the data is aggregated yearly. All of the data is in agency-year units such that every row indicates an agency in a given year. Columns are crime-arrest category units. For example, if you choose the data set that includes murder, you would have rows for each agency-year and columns with the number of people arrested for murder. The ASR data breaks down arrests by age and gender (e.g. Male aged 15, Male aged 18). They also provide the number of adults or juveniles arrested by race. Because most agencies and years do not report the arrestee's ethnicity (Hispanic or not Hispanic) or juvenile outcomes (e.g. referred to adult court, referred to welfare agency), I do not include these columns.
To make it easier to merge with other data, I merged this data with the Law Enforcement Agency Identifiers Crosswalk (LEAIC) data. The data from the LEAIC add FIPS (state, county, and place) and agency type/subtype. Please note that some of the FIPS codes have leading zeros and if you open it in Excel it will automatically delete those leading zeros.
I created 9 arrest categories myself. The categories are:
Total Male Juvenile, Total Female Juvenile, Total Male Adult, Total Female Adult, Total Male, Total Female, Total Juvenile, Total Adult, Total Arrests. All of these categories are based on the sums of the sex-age categories (e.g. Male under 10, Female aged 22) rather than using the provided age-race categories (e.g. adult Black, juvenile Asian). As not all agencies report the race data, my method is more accurate. These categories also make up the data in the "simple" version of the data. The "simple" file only includes the above 9 columns as the arrest data (all other columns in the data are just agency identifier columns). Because this "simple" data set needs fewer columns, I include all offenses.
As the arrest data is very granular, and each category of arrest is its own column, there are dozens of columns per crime. To keep the data somewhat manageable, there are nine different files, eight which contain different crimes and the "simple" file. Each file contains the data for all years. The eight categories each have crimes belonging to a major crime category and do not overlap in crimes other than with the index offenses. Please note that the crime names provided below are not the same as the column names in the data. Due to Stata limiting column names to 32 characters maximum, I have abbreviated the crime names in the data. The files and their included crimes are:
Index Crimes: Murder, Rape, Robbery, Aggravated Assault, Burglary, Theft, Motor Vehicle Theft, Arson
Alcohol Crimes: DUI, Drunkenness, Liquor
Drug Crimes: Total Drug, Total Drug Sales, Total Drug Possession, Cannabis Possession, Cannabis Sales, Heroin or Cocaine Possession, Heroin or Cocaine Sales, Other Drug Possession, Other Drug Sales, Synthetic Narcotic Possession, Synthetic Narcotic Sales
Grey Collar and Property Crimes: Forgery, Fraud, Stolen Property
Financial Crimes: Embezzlement, Total Gambling, Other Gambling, Bookmaking, Numbers Lottery
Sex or Family Crimes: Offenses Against the Family and Children, Other Sex Offenses, Prostitution, Rape
Violent Crimes: Aggravated Assault, Murder, Negligent Manslaughter, Robbery, Weapon Offenses
Other Crimes: Curfew, Disorderly Conduct, Other Non-traffic, Suspicion, Vandalism, Vagrancy
Simple
This data set has every crime and only the arrest categories that I created (see above).
If you have any questions, comments, or suggestions please contact me at jkkaplan6@gmail.com.
Facebook
TwitterAttribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
This dataset includes ALL the abundance values, zero and non-zero. Taxonomic groups are displayed in the 'taxon' column, rather than in separate columns, with abundances in the 'abund_L' column. For the original presentation of the data, see VPR_ashjian_orig. For a version of the data with only non-zero data, see VPR_ashjian_nonzero. In the 'nonzero' dataset, values of 0 in the abund_L column (taxon abundance) have been removed.
Methodology
The following information was extracted from C.J. Ashjian et al., Deep-Sea Research II 48 (2001) 245-282. An in-depth discussion of the data and sampling methods can be found there.
The Video Plankton Recorder was towed at 2 m/s, collecting data from the surface to the bottom (towyo). The VPR was equipped with 2-4 cameras, temperature and conductivity probes, fluorometer and transmissometer. Environmental data was collected at 0.25 Hz (CI9407) or 0.5 Hz (EN259, EN262). Video images were recorded at 60 fields per second (fps).
Video tapes were analyzed for plankton abundances using a semi-automated method discussed in Davis, C.S. et al., Deep-Sea Research II 43 (1996) 1946-1970. In-focus images were extracted from the video tapes and identified by hand to particle type, taxon, or species. Plankton and particle observations were merged with environmental and navigational data by binning the observations for each category into the time intervals at which the environmental data were collected (again see above Davis citation). Concentrations were calculated utilizing the total volume (liters) imaged during that period. For less-abundant categories, usually only a single organism was observed during each time interval so that the resulting concentrations are close to presence or absence data rather than covering a range of values.
Facebook
Twitterhttps://doi.org/10.5061/dryad.brv15dvh0
On each trial, participants heard a stimulus and clicked a box on the computer screen to indicate whether they heard "SET" or "SAT." Responses of "SET" are coded as 0 and responses of "SAT" are coded as 1. The continuum steps, from 1-7, for duration and spectral quality cues of the stimulus on each trial are named "DurationStep" and "SpectralStep," respectively. Group (young or older adult) and listening condition (quiet or noise) information are provided for each row of the dataset.
Facebook
TwitterWelcome to the Cyclistic bike-share analysis case study! In this case study, you will perform many real-world tasks of a junior data analyst. You will work for a fictional company, Cyclistic, and meet different characters and team members. In order to answer the key business questions, you will follow the steps of the data analysis process: ask, prepare, process, analyze, share, and act. Along the way, the Case Study Roadmap tables — including guiding questions and key tasks — will help you stay on the right path.
You are a junior data analyst working in the marketing analyst team at Cyclistic, a bike-share company in Chicago. The director of marketing believes the company’s future success depends on maximizing the number of annual memberships. Therefore, your team wants to understand how casual riders and annual members use Cyclistic bikes differently. From these insights, your team will design a new marketing strategy to convert casual riders into annual members. But first, Cyclistic executives must approve your recommendations, so they must be backed up with compelling data insights and professional data visualizations.
How do annual members and casual riders use Cyclistic bikes differently?
What is the problem you are trying to solve?
How do annual members and casual riders use Cyclistic bikes differently?
How can your insights drive business decisions?
The insight will help the marketing team to make a strategy for casual riders
Where is your data located?
Data located in Cyclistic organization data.
How is data organized?
The datasets are in CSV format, one file per month, for financial year 22.
Are there issues with bias or credibility in this data? Does your data ROCCC?
The data is good and it is ROCCC because it was collected by the Cyclistic organization itself.
How are you addressing licensing, privacy, security, and accessibility?
The company has their own license over the dataset. Dataset does not have any personal information about the riders.
How did you verify the data’s integrity?
All the files have consistent columns and each column has the correct type of data.
How does it help you answer your questions?
Insights are always hidden in the data. We have to interpret the data to find the insights.
Are there any problems with the data?
Yes, starting station names, ending station names have null values.
What tools are you choosing and why?
I used R studio for the cleaning and transforming the data for analysis phase because of large dataset and to gather experience in the language.
Have you ensured the data’s integrity?
Yes, the data is consistent throughout the columns.
What steps have you taken to ensure that your data is clean?
First duplicates, null values are removed then added new columns for analysis.
How can you verify that your data is clean and ready to analyze?
Make sure the column names are consistent throughout all data sets by using the “bind row” function.
Make sure column data types are consistent throughout all the dataset by using the “compare_df_col” from the “janitor” package.
Combine all the datasets into a single data frame to keep them consistent throughout the analysis.
Removed the column start_lat, start_lng, end_lat, end_lng from the dataframe because those columns not required for analysis.
Create new columns day, date, month, year, from the started_at column this will provide additional opportunities to aggregate the data
Create the “ride_length” column from the started_at and ended_at column to find the average duration of the ride by the riders.
Removed the null rows from the dataset by using the “na.omit function”
Have you documented your cleaning process so you can review and share those results?
Yes, the cleaning process is documented clearly.
How should you organize your data to perform analysis on it? The data has been organized in one single dataframe by using the read csv function in R. Has your data been properly formatted? Yes, all the columns have their correct data type.
What surprises did you discover in the data?
Casual member ride duration is higher than the annual members
Casual members use docked bikes more widely than the annual members
What trends or relationships did you find in the data?
Annual members mainly use the bikes for commuting purposes
Casual members prefer the docked bikes
Annual members prefer the electric or classic bikes
How will these insights help answer your business questions?
These insights help to build a profile for members
Were you able to answer the question of how ...
Facebook
TwitterAttribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
Discrete wavelength radiance measurements from the Deep Space Climate Observatory (DSCOVR) Earth Polychromatic Imaging Camera (EPIC) allows derivation of global synoptic maps of total and tropospheric ozone columns every hour during Northern Hemisphere (NH) Summer or 2 hours during Northern Hemisphere winter. In this study, we present version 3 retrieval of Earth Polychromatic Imaging Camera ozone that covers the period from June 2015 to the present with improved geolocation, calibration, and algorithmic updates. The accuracy of total and tropospheric ozone measurements from EPIC have been evaluated using correlative satellite and ground-based total and tropospheric ozone measurements at time scales from daily averages to monthly means. The comparisons show good agreement with increased differences at high latitudes. The agreement improves if we only accept retrievals derived from the EPIC 317 nm triplet and limit solar zenith and satellite looking angles to 70°. With such filtering in place, the comparisons of EPIC total column ozone retrievals with correlative satellite and ground-based data show mean differences within ±5-7 Dobson Units (or 1.5–2.5%). The biases with other satellite instruments tend to be mostly negative in the Southern Hemisphere while there are no clear latitudinal patterns in ground-based comparisons. Evaluation of the EPIC ozone time series at different ground-based stations with the correlative ground-based and satellite instruments and ozonesondes demonstrated good consistency in capturing ozone variations at daily, weekly and monthly scales with a persistently high correlation (r2 > 0.9) for total and tropospheric columns. We examined EPIC tropospheric ozone columns by comparing with ozonesondes at 12 stations and found that differences in tropospheric column ozone are within ±2.5 DU (or ∼±10%) after removing a constant 3 DU offset at all stations between EPIC and sondes. 
The analysis of the time series of zonally averaged EPIC tropospheric ozone revealed a statistically significant drop of ∼2–4 DU (∼5–10%) over the entire NH in spring and summer of 2020. This drop in tropospheric ozone is partially related to the unprecedented Arctic stratospheric ozone losses in winter-spring 2019/2020 and reductions in ozone precursor pollutants due to the COVID-19 pandemic.
Facebook
TwitterAttribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
Column A: Binary classification of data based on laboratory values from Column B (cut-off value = 0: 0 = 0, > 0 = 1); Column B: Laboratory values; Column C: Randomized patient numbers; Columns E–H: Wavelengths with corresponding spectral data. (XLSX)
Facebook
Twitterhttps://creativecommons.org/publicdomain/zero/1.0/https://creativecommons.org/publicdomain/zero/1.0/
Facebook
Twitterhttps://creativecommons.org/publicdomain/zero/1.0/https://creativecommons.org/publicdomain/zero/1.0/
Facebook
TwitterAttribution 4.0 (CC BY 4.0)https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
This repository includes two datasets, a Document-Term Matrix and associated metadata, for 17,493 New York Times articles covering protest events, both saved as single R objects.
These datasets are based on the original Dynamics of Collective Action (DoCA) dataset (Wang and Soule 2012; Earl, Soule, and McCarthy). The original DoCA dataset contains variables for protest events referenced in roughly 19,676 New York Times articles reporting on collective action events occurring in the US between 1960 and 1995. Data were collected as part of the Dynamics of Collective Action Project at Stanford University. Research assistants read every page of all daily issues of the New York Times to find descriptions of 23,624 distinct protest events. The text of the news articles was not included in the original DoCA data.
We attempted to recollect the raw text in a semi-supervised fashion by matching article titles to create the Dynamics of Collective Action Corpus. In addition to hand-checking random samples and hand-collecting some articles (specifically, in the case of false positives), we also used some automated matching processes to ensure the recollected article titles matched their respective titles in the DoCA dataset. The final number of recollected and matched articles is 17,493.
We then subset the original DoCA dataset to include only rows that match a recollected article. The "20231006_dca_metadata_subset.Rds" contains all of the metadata variables from the original DoCA dataset (see Codebook), with the addition of "pdf_file" and "pub_title" which is the title of the recollected article (and may differ from the "title" variable in the original dataset), for a total of 106 variables and 21,126 rows (noting that a row is a distinct protest event and one article may cover more than one protest event).
Once collected, we prepared these texts using typical preprocessing procedures (and some less typical procedures, which were necessary given that these were OCRed texts). We followed these steps in this order: We removed headers and footers that were consistent across all digitized stories and any web links or HTML; added a single space before an uppercase letter when it was flush against a lowercase letter to its right (e.g., turning "JohnKennedy" into "John Kennedy"); removed excess whitespace; converted all characters to the broadest range of Latin characters and then transliterated to "Basic Latin" ASCII characters; replaced curly quotes with their ASCII counterparts; replaced contractions (e.g., turned "it's" into "it is"); removed punctuation; removed capitalization; removed numbers; fixed word kerning; applied a final extra round of whitespace removal.
We then tokenized them by following the rule that each word is a character string surrounded by a single space. At this step, each document is then a list of tokens. We count each unique token to create a document-term matrix (DTM), where each row is an article, each column is a unique token (occurring at least once in the corpus as a whole), and each cell is the number of times each token occurred in each article. Finally, we removed words (i.e., columns in the DTM) that occurred fewer than four times in the corpus as a whole or were only a single character in length (likely orphaned characters from the OCRing process). The final DTM has 66,552 unique words, 10,134,304 total tokens and 17,493 documents. The "20231006_dca_dtm.Rds" is a sparse matrix class object from the Matrix R package.
In R, use the load() function to load the objects dca_dtm and dca_meta. To associate the dca_meta to the dca_dtm, match the "pdf_file" variable in dca_meta to the rownames of dca_dtm.
Facebook
TwitterAttribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)https://creativecommons.org/licenses/by-nc-sa/4.0/
License information was derived automatically
The original source for the data is the online publisher Top Wine SA. They are well-established experts and collect data from various expert sources within the South African Wine market and keep track of the best wines based on local as well as international judges. This data was published in 2023.
Top Wine SA – Top SA Wine Ratings https://topwinesa.com/top-sa-wines-and-cellars/top-sa-wine-ratings/
The original dataset provides the following description regarding categories:
“Prices are given in South African rand (R) per standard bottle size (750ml), unless stated otherwise, and are inclusive of VAT – prices as supplied by the producers, or approximate retail prices in South Africa if the producers are closed to the public – top wines may well be ‘sold out’ or increase in price after a stellar rating.
Note: AW = Auction Wine, EXP = Export Only, MC = Museum Class (typically ‘sold out’ at the winery, assessed to gauge development or longevity), and OL = Own Label (exclusive to particular retailer(s), not available from the producer).”
I removed these categories because my focus is on what the average South African consumer is paying for.
Wine that is export only is automatically excluded in this case, as are Museum Class and Auction Wine. Although some Own Label wines were not too difficult to find, I was not able to find all of them, so for this reason I have excluded these.
Below I have listed how many in each of these particular categories there were, before removing them.
• AW: 8 • EX: 21 • MC: 4 • OL: 16
The Process I manually copied the data from the TOP SA wines website at this address: https://topwinesa.com/top-sa-wines-and-cellars/top-sa-wine-ratings/
I added in an extra column for Category and Second Category. And assigned the relevant category to each one.
I cleaned up the data, which involved sorting and deleting unnecessary rows. There were blank cells in some columns as some wines had more than one award. I removed any additional awards, as the focus of this data set is the wines, vintages, and prices that consumers are paying for award-winning South African wine (as opposed to the particular award that was won).
Then I deleted all entries of the categories mentioned above (AW, EX, MC, OL).
The prices were entered as text with the letter “R”. I used a formula in excel to remove the letter “R” from the price column so that the price could be stored as a number.
=IF(ISNUMBER(VALUE(SUBSTITUTE(B2,"R ",""))), VALUE(SUBSTITUTE(B2,"R ","")), "")
I wanted to have the vintage as its own column. So I used a formula to extract the numbers from the wine names provided. For most of the entries, the year was the only number in the name. For those entries that had additional numbers in the name, I manually fixed those cases.
=IFERROR(VALUE(TEXTJOIN("", TRUE, IF(ISNUMBER(--MID(A2, ROW(INDIRECT("1:" & LEN(A2))), 1)), MID(A2, ROW(INDIRECT("1:" & LEN(A2))), 1), ""))), "")
Not seeing a result you expected?
Learn how you can add new datasets to our index.
Facebook
Twitterhttp://opendatacommons.org/licenses/dbcl/1.0/http://opendatacommons.org/licenses/dbcl/1.0/
#https://www.kaggle.com/c/facial-keypoints-detection/details/getting-started-with-r #################################
###Variables for downloaded files data.dir <- ' ' train.file <- paste0(data.dir, 'training.csv') test.file <- paste0(data.dir, 'test.csv') #################################
###Load csv -- creates a data.frame matrix where each column can have a different type. d.train <- read.csv(train.file, stringsAsFactors = F) d.test <- read.csv(test.file, stringsAsFactors = F)
###In training.csv, we have 7049 rows, each one with 31 columns. ###The first 30 columns are keypoint locations, which R correctly identified as numbers. ###The last one is a string representation of the image, identified as a string.
###To look at samples of the data, uncomment this line:
###Let's save the first column as another variable, and remove it from d.train: ###d.train is our dataframe, and we want the column called Image. ###Assigning NULL to a column removes it from the dataframe
im.train <- d.train$Image d.train$Image <- NULL #removes 'image' from the dataframe
im.test <- d.test$Image d.test$Image <- NULL #removes 'image' from the dataframe
################################# #The image is represented as a series of numbers, stored as a string #Convert these strings to integers by splitting them and converting the result to integer
#strsplit splits the string #unlist simplifies its output to a vector of strings #as.integer converts it to a vector of integers. as.integer(unlist(strsplit(im.train[1], " "))) as.integer(unlist(strsplit(im.test[1], " ")))
###Install and activate appropriate libraries ###The tutorial is meant for Linux and OSx, where they use a different library, so: ###Replace all instances of %dopar% with %do%.
library("foreach", lib.loc="~/R/win-library/3.3")
###implement parallelization im.train <- foreach(im = im.train, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } im.test <- foreach(im = im.test, .combine=rbind) %do% { as.integer(unlist(strsplit(im, " "))) } #The foreach loop will evaluate the inner command for each row in im.train, and combine the results with rbind (combine by rows). #%do% instructs R to run the evaluations sequentially (%dopar%, used in the original tutorial, is the parallel version). #im.train is now a matrix with 7049 rows (one for each image) and 9216 columns (one for each pixel):
###Save all four variables in data.Rd file ###Can reload them at anytime with load('data.Rd')
#each image is a vector of 96*96 pixels (96*96 = 9216). #convert these 9216 integers into a 96x96 matrix: im <- matrix(data=rev(im.train[1,]), nrow=96, ncol=96)
#im.train[1,] returns the first row of im.train, which corresponds to the first training image. #rev reverse the resulting vector to match the interpretation of R's image function #(which expects the origin to be in the lower left corner).
#To visualize the image we use R's image function: image(1:96, 1:96, im, col=gray((0:255)/255))
#Let’s color the coordinates for the eyes and nose points(96-d.train$nose_tip_x[1], 96-d.train$nose_tip_y[1], col="red") points(96-d.train$left_eye_center_x[1], 96-d.train$left_eye_center_y[1], col="blue") points(96-d.train$right_eye_center_x[1], 96-d.train$right_eye_center_y[1], col="green")
#Another good check is to see how variable is our data. #For example, where are the centers of each nose in the 7049 images? (this takes a while to run): for(i in 1:nrow(d.train)) { points(96-d.train$nose_tip_x[i], 96-d.train$nose_tip_y[i], col="red") }
#there are quite a few outliers -- they could be labeling errors. Looking at one extreme example we get this: #In this case there's no labeling error, but this shows that not all faces are centralized idx <- which.max(d.train$nose_tip_x) im <- matrix(data=rev(im.train[idx,]), nrow=96, ncol=96) image(1:96, 1:96, im, col=gray((0:255)/255)) points(96-d.train$nose_tip_x[idx], 96-d.train$nose_tip_y[idx], col="red")
#One of the simplest things to try is to compute the mean of the coordinates of each keypoint in the training set and use that as a prediction for all images colMeans(d.train, na.rm=T)
#To build a submission file we need to apply these computed coordinates to the test instances: p <- matrix(data=colMeans(d.train, na.rm=T), nrow=nrow(d.test), ncol=ncol(d.train), byrow=T) colnames(p) <- names(d.train) predictions <- data.frame(ImageId = 1:nrow(d.test), p) head(predictions)
#The expected submission format has one keypoint per row, but we can easily get that with the help of the reshape2 library:
library(...