# Build-time setup: compile this package's Scala sources into JARs that
# sparklyr can ship to Spark. Requires a local Spark and JDK install.
library(sparklyr)

# Locate the Scala 2.11 compiler required by Spark 2.x; download it to
# /opt/scala if it is not already present.
find_scalac(version = "2.11")
download_scalac(dest_path = "/opt/scala")

# Point sparklyr at the local Spark and Java installations.
Sys.setenv(SPARK_HOME = "/opt/spark-2.2.0-bin-hadoop2.7")
Sys.setenv(JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64")

# Restrict the default compilation spec to Spark >= 2.2.0.
# NOTE: compare as versions, not strings -- a lexicographic ">=" on the
# raw character vector would misorder releases (e.g. "2.10.0" < "2.2.0").
spec <- sparklyr::spark_default_compilation_spec()
spec <- Filter(
  function(e) numeric_version(e$spark_version) >= numeric_version("2.2.0"),
  spec
)
sparklyr::compile_package_jars(spec = spec)
# Build the source tarball. "R CMD build" is a shell command, not R syntax,
# so it must be invoked through system2() (or run from a terminal) rather
# than pasted directly into the script.
system2("R", args = c("CMD", "build", "sparklyudf"))
# Install the freshly built tarball from the local filesystem.
# NOTE: R does not expand shell variables such as "$HOME" inside string
# literals, so the path must be constructed explicitly.
install.packages(
  file.path(Sys.getenv("HOME"), "R", "mypackages", "sparklyudf_0.1.0.tar.gz"),
  repos = NULL,
  type = "source"
)
library(sparklyudf)
library(sparklyr)
library(dplyr)

# Connect to the standalone cluster and register the Scala UDF that the
# sparklyudf package's JAR exposes on this connection.
sc <- spark_connect(master = "spark-master")
sparklyudf_register(sc)

# Copy a small demo data frame into Spark, then call the registered UDF
# from a dplyr verb; the call is translated and executed on the cluster.
demo_tbl <- data.frame(path = "some_data") %>%
  copy_to(sc, .)
demo_tbl %>%
  mutate(file_name = get_only_file_name(path))
# More information: https://github.com/javierluraschi/sparklyudf and
# https://spark.rstudio.com/reference/#section-extensions