Cloud computing with R and AWS

Page content

Why?

  1. You want to run R code on the cloud.
  2. For whatever reason, you don’t want to use Google Cloud or Azure.

Credit

I took most of the code from this gist

The code

This function takes a list of your instances and the path to your private key, and returns a cluster object that can be used with the future package. I was told that this function will be part of a new package soon.

#' Build a PSOCK cluster from launched EC2 instances
#'
#' Polls every instance until its state is "running" and its status check
#' reports "ok", looks up each instance's public IP, and then connects to all
#' of them over SSH through `makeClusterPSOCK()`.
#'
#' @param instances A list of instances; each element is passed to
#'   `instance_status()` and `describe_instances()`.
#' @param key Path to the private SSH key, handed to ssh via `-i`.
#' @param poll_interval Non-negative number of seconds to pause between
#'   polling rounds while at least one instance is not yet initialized.
#' @return The cluster object returned by `makeClusterPSOCK()`.
aws_cluster <- function(instances, key, poll_interval = 5) {
  stopifnot(
    is.numeric(poll_interval),
    length(poll_interval) == 1L,
    !is.na(poll_interval),
    poll_interval >= 0
  )

  # Length-safe comparison: a missing field (NULL) is simply "not equal"
  # instead of making `if` fail with "argument is of length zero".
  field_is <- function(field, value) {
    field <- unlist(field, use.names = FALSE)
    length(field) == 1L && !is.na(field) && field == value
  }

  # Ensure we are running and initialized
  is_running <- logical(length(instances))
  is_initialized <- logical(length(instances))

  while (!all(is_initialized)) {

    # Only poll instances that are still pending
    for (ii in which(!is_initialized)) {

      # Current instance
      i_ii <- instance_status(instances[[ii]])

      # Initially, we don't get any information
      if (length(i_ii) == 0) {
        next
      }

      # First check if we are at least running
      if (!is_running[ii] && field_is(i_ii$item$instanceState$name, "running")) {
        is_running[ii] <- TRUE
        message("Instance ", ii, " is running. Now initialzing.")
      }

      # Then check if we are initialized
      if (field_is(i_ii$item$instanceStatus$status, "ok")) {
        is_initialized[ii] <- TRUE
        message("Instance ", ii, " is initialized.")
      }
    }

    # Pause between rounds so the status API is not called in a tight loop
    if (!all(is_initialized)) {
      Sys.sleep(poll_interval)
    }
  }

  # Get the public IPs
  public_ip <- vapply(
    instances,
    function(i_ii) {
      i_di <- describe_instances(i_ii)
      ip <- i_di[[1]]$instancesSet[[1]]$networkInterfaceSet$privateIpAddressesSet$association$publicIp
      if (!is.character(ip) || length(ip) != 1L) {
        stop("Could not determine the public IP of an instance.", call. = FALSE)
      }
      ip
    },
    FUN.VALUE = character(1)
  )

  # Connect!
  makeClusterPSOCK(

    ## Public IP number of EC2 instance
    public_ip,

    ## User name (always 'ubuntu')
    user = "ubuntu",

    ## Use private SSH key registered with AWS
    rshopts = c(
      "-o", "StrictHostKeyChecking=no",
      "-o", "IdentitiesOnly=yes",
      "-i", key
    ),

    ## Set up .libPaths() for the 'ubuntu' user and
    ## install future/purrr/furrr packages
    rscript_args = c(
      "-e", shQuote("local({p <- Sys.getenv('R_LIBS_USER'); dir.create(p, recursive = TRUE, showWarnings = FALSE); .libPaths(p)})"),
      "-e", shQuote("install.packages(c('future', 'purrr', 'furrr'))")
    ),

    dryrun = FALSE
  )
}

Load the relevant packages

# Packages used by the snippets in this post.
# The commented-out line installs aws.ec2 from a GitHub pull request;
# run it once if the package is not installed yet.
# devtools::install_github("cloudyr/aws.ec2", ref = devtools::github_pull(38))
library(aws.ec2)
library(future)
library(furrr)
library(tictoc)
# NOTE(review): presumably loads your AWS credentials for the session --
# confirm against the aws.signature documentation.
aws.signature::use_credentials()

Baseline test

# Baseline: run two 60-second sleeps under the sequential plan and time them.
plan(sequential)
tic(msg = "baseline")
future_map(seq_len(2L), function(task) Sys.sleep(60))
toc()

baseline: 121.231 sec elapsed

# Launch two EC2 instances and turn them into a cluster object.
# The AMI id, security group id, key pair name and key path below are
# specific to the author's account; replace them with your own.
image <- "ami-fd2ffe87"
# Check your VPC and Security Group settings
s <- describe_subnets()
g <- describe_sgroups("sg-16fa225d")
kp <- describe_keypairs("synology") # <- Your keypair here

# Launch the instance using appropriate settings
# (the first subnet returned by describe_subnets() is used)
i <- run_instances(image = image,
                   type = "t2.medium",
                   sgroup = g,
                   subnet = s[[1]],
                   min = 2L, # <- Launching 2 medium instances
                   keypair = kp$synology)

# Wait for the instances and connect to them; `key` is the path to the
# private key file matching the key pair used above.
cl <- aws_cluster(instances = i, key = "/home/ignacio/AWS/synology.pem")

############## Now we have a cluster object we can use with future
# Same workload as the baseline, now evaluated on the workers in `cl`.
plan(cluster, workers = cl)
tic(msg = "test")
future_map(seq_len(2L), function(task) Sys.sleep(60))
toc()

test: 61.698 sec elapsed

Finally, you can programmatically shut down the cluster we just created

# Stop the cluster workers first, then terminate the instances launched above.
parallel::stopCluster(cl)
terminate_instances(i)

Video talking about this