Generating random data with R

Sometimes we need a random data set for machine learning or data science tests. Here’s a brief collection of random data generation code in R.

The quick way: runif()

runif() generates uniformly distributed random numbers; it requires the number of values to return.

# A single random number: one double drawn uniformly between 0 and 1
random_value <- runif(n = 1)

# Ten random numbers, returned as a plain numeric vector
random_list <- runif(n = 10)

# A one-column data frame (column "Data") holding ten random numbers
random_df <- data.frame(Data = runif(n = 10))

Setting the value interval

You can specify the random interval with max and min parameters:

# Ten uniform random numbers drawn from the interval 50 to 100
randoms <- runif(n = 10, min = 50, max = 100)
randoms
  1. 69.5292674936354
  2. 90.2386948582716
  3. 72.3201019689441
  4. 55.271887825802
  5. 93.293529143557
  6. 71.0763963637874
  7. 94.5607772911899
  8. 65.7718844828196
  9. 82.5249472516589
  10. 58.4923235932365

Integers

Just wrap the runif() call in as.integer(), which truncates each value to a whole number:

# Random integers: as.integer() drops the fractional part of every draw,
# so uniform values between 1 and 74 become whole numbers from 1 to 73.
ages <- data.frame(
  AGE = as.integer(runif(n = 5, min = 1, max = 74))
)
ages
AGE
21
45
55
38
22

Random values on a time interval

Use the seq() function to build a sequence of dates, then generate a random value for each time entry:

# Daily time axis: one Date per day from 2016-12-20 through 2017-01-05
date_sequence <- seq(
  from = as.Date("2016/12/20"),
  to = as.Date("2017/1/5"),
  by = "days"
)

# Pair every date with its own uniform random value between 0 and 1
my_data <- data.frame(
  Date = date_sequence,
  Value = runif(n = length(date_sequence))
)

my_data
Date        Value
2016-12-20  0.41541707
2016-12-21  0.68279532
2016-12-22  0.04138726
2016-12-23  0.82534459
2016-12-24  0.12639527
2016-12-25  0.80020737
2016-12-26  0.88571625
2016-12-27  0.22996174
2016-12-28  0.54546891
2016-12-29  0.40647596
2016-12-30  0.73663513
2016-12-31  0.37449615
2017-01-01  0.26215537
2017-01-02  0.07069046
2017-01-03  0.47572083
2017-01-04  0.65177759
2017-01-05  0.43956374

Random date/time intervals

There’s a great answer on Stack Overflow by Dirk Eddelbuettel about how to create random temporal samples:

# Draw N random timestamps between two dates, returned in ascending order.
#
# Arguments:
#   N           Number of timestamps to generate.
#   start_date  Lower bound of the interval; anything as.Date() can parse.
#   end_date    Upper bound of the interval; anything as.Date() can parse.
#
# Returns a POSIXct vector of length N, sorted from earliest to latest.
# Stops with an error when end_date is earlier than start_date (otherwise
# runif() would only warn and produce NaN dates).
random_dates <- function(N,
                         start_date = "1950/01/01",
                         end_date = "2000/12/31") {
  start_date <- as.POSIXct(as.Date(start_date))
  end_date <- as.POSIXct(as.Date(end_date))

  # Width of the interval in seconds; the argument and its value are spelled
  # out in full instead of relying on partial matching.
  diff_time <- as.numeric(difftime(end_date, start_date, units = "secs"))
  if (diff_time < 0) {
    stop("end_date must not be earlier than start_date", call. = FALSE)
  }

  # Uniform offsets (in seconds) from the start, sorted chronologically
  elapsed_time <- sort(runif(N, 0, diff_time))
  start_date + elapsed_time
}

random_dates(20)
 [1] "1951-03-22 01:20:09 CET"  "1954-07-29 11:32:06 CET" 
 [3] "1959-05-28 07:27:11 CET"  "1961-08-30 01:17:19 CET" 
 [5] "1962-09-24 10:31:39 CET"  "1964-07-15 05:56:34 CET" 
 [7] "1969-11-20 09:33:35 CET"  "1970-12-24 20:42:15 CET" 
 [9] "1973-02-06 07:39:12 CET"  "1977-04-03 13:02:32 CEST"
[11] "1978-01-15 00:08:36 CET"  "1979-02-11 03:01:08 CET" 
[13] "1979-03-16 23:38:50 CET"  "1982-01-16 21:49:48 CET" 
[15] "1982-09-01 12:33:32 CEST" "1985-07-24 08:06:29 CEST"
[17] "1988-01-21 03:40:17 CET"  "1988-11-16 22:51:04 CET" 
[19] "1993-03-26 10:57:17 CET"  "1997-05-15 09:18:19 CEST"

Random entries from a list

You can build a random selection of elements with sample():

# Pools of first names and family names to draw from
names <- c("John", "Amy", "Mark", "Miranda", "James", "Mary")
family_names <- c("Smith", "Jones", "Taylor", "Williams", "Brown")

# Twenty full names of the form "<first name> <initial>. <family name>",
# each part sampled independently and with replacement
people_sample <- data.frame(
  Name = paste(
    sample(names, size = 20, replace = TRUE),
    paste0(sample(LETTERS, size = 20, replace = TRUE), "."),
    sample(family_names, size = 20, replace = TRUE)
  )
)

people_sample
Name
Mark O. Jones
John Y. Jones
James O. Jones
John Z. Brown
Amy I. Williams
Mary V. Williams
Mary M. Brown
Amy I. Smith
Mary E. Taylor
John B. Taylor
John V. Williams
Miranda V. Brown
James Z. Smith
Mary Z. Taylor
Amy U. Williams
Miranda Y. Williams
Miranda N. Williams
James Y. Brown
Mary F. Taylor
Amy Y. Taylor

Putting it all together: a fake user generator

Now let’s build a random data set.

The generate_users() function will return a data frame with made-up user information.

# Default pools of first names and family names used by generate_users()
names <- c("John", "Amy", "Mark", "Miranda", "James", "Mary")
family_names <- c("Smith", "Jones", "Taylor", "Williams", "Brown")

# Fake user generator
#
# Arguments:
#   N                Number of users to generate (a single number >= 0).
#   birthdate_start  Earliest birth date; anything as.Date() can parse.
#   birthdate_end    Latest birth date; anything as.Date() can parse.
#   first_names      Character vector of first names to sample from
#                    (defaults to the `names` vector defined above).
#   last_names       Character vector of family names to sample from
#                    (defaults to the `family_names` vector defined above).
#
# Returns a data frame with one row per user and the columns:
#   Name       "<first name> <initial>. <family name>"
#   BirthDate  POSIXct timestamps, in ascending order down the rows
#   Height     Normally distributed around 69.7 (sd 2)
#   Weight     Height multiplied by a factor drawn around 1.5 (sd 0.02)
generate_users <- function(N,
                           birthdate_start = "1950/01/01",
                           birthdate_end = "2000/12/31",
                           first_names = names,
                           last_names = family_names) {
  stopifnot(
    is.numeric(N), length(N) == 1L, !is.na(N), N >= 0,
    is.character(first_names), length(first_names) > 0L,
    is.character(last_names), length(last_names) > 0L
  )

  # Users name
  given <- sample(first_names, N, replace = TRUE)
  initials <- sample(LETTERS, N, replace = TRUE)
  surnames <- sample(last_names, N, replace = TRUE)
  # paste() treats zero-length input as "", which would yield a single
  # " . " entry for N = 0, so the empty case is handled explicitly.
  full_names <- if (length(given) > 0L) {
    paste(given, paste0(initials, "."), surnames)
  } else {
    character(0)
  }

  # Birth dates: sorted uniform offsets (in seconds) from the start date
  start_date <- as.POSIXct(as.Date(birthdate_start))
  end_date <- as.POSIXct(as.Date(birthdate_end))
  diff_time <- as.numeric(difftime(end_date, start_date, units = "secs"))
  if (diff_time < 0) {
    stop("birthdate_end must not be earlier than birthdate_start",
         call. = FALSE)
  }
  elapsed_time <- sort(runif(N, 0, diff_time))
  birth_dates <- start_date + elapsed_time

  # Height (normal distribution over the "mean")
  heights <- rnorm(N, mean = 69.7, sd = 2)

  # Weight (as a relation with the height)
  weights <- heights * rnorm(N, mean = 1.5, sd = 0.02)

  # Build and return the complete data frame
  data.frame(
    Name = full_names,
    BirthDate = birth_dates,
    Height = heights,
    Weight = weights
  )
}

# test
users <- generate_users(20)

users
Name                 BirthDate            Height    Weight
Mary W. Taylor       1952-07-30 20:00:49  70.68220  106.89455
James F. Brown       1952-11-09 02:40:12  69.58180  106.38550
James N. Taylor      1956-07-21 14:47:20  72.27035  109.25654
James W. Williams    1959-01-23 01:10:51  75.84764  115.71886
James I. Brown       1965-06-30 10:34:20  68.41527  103.00391
John J. Brown        1968-05-07 23:31:09  69.67103  103.06098
Miranda F. Brown     1970-09-05 09:00:40  66.05836   96.90736
Miranda W. Jones     1972-05-13 05:44:06  68.77517  103.05811
Mary C. Smith        1979-12-07 23:46:26  69.00136  105.93778
John S. Brown        1981-04-27 02:48:33  69.22812  101.42435
Miranda M. Jones     1981-12-28 18:10:59  66.24654   98.45287
Miranda J. Jones     1984-11-02 01:33:38  70.65022  104.46331
Mark R. Taylor       1987-01-23 05:59:04  68.29989  103.67119
Amy X. Williams      1989-07-02 16:26:20  69.34222  105.75731
Amy M. Jones         1991-06-09 07:58:51  71.99594  109.21450
Mary B. Taylor       1997-07-11 18:28:11  68.09339  100.96932
Miranda B. Taylor    1998-09-23 14:55:32  68.04952  102.17367
Amy X. Jones         1999-05-19 15:16:13  71.58484  105.74813
Miranda J. Brown     1999-09-02 18:14:03  67.60330  100.80424
Amy R. Jones         2000-10-02 03:00:17  72.16646  106.96718