Big data configuration
parent 43f698a329
commit 72df315b4d
10 changed files with 634 additions and 2 deletions

nixos/thor.nix (138 lines changed)

@@ -3,6 +3,46 @@
 # and in the NixOS manual (accessible by running ‘nixos-help’).

 { config, pkgs, lib, ... }:

 let sparkConfDir = pkgs.stdenv.mkDerivation {
   name = "spark-config";
   dontUnpack = true;
   installPhase = ''
     # source standard environment
     . $stdenv/setup

     # shorthands
     base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/

     # create output dirs for new derivation
     mkdir -p $out/

     # link the unchanged template files from the original Spark conf directory
     for f in $base_conf/*.template ; do
       ln -sf $f $out/
     done

     # change selected files
     cp $out/log4j.properties{.template,}

     cat > $out/spark-env.sh <<- STOP
     export JAVA_HOME="${pkgs.jdk8}"
     export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}"
     export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath)
     export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}"
     export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH"
     export SPARKR_R_SHELL="${pkgs.R}/bin/R"
     export PATH="\$PATH:${pkgs.R}/bin"
     STOP

     cat > $out/spark-defaults.conf <<- STOP
     spark.eventLog.enabled true
     spark.eventLog.dir hdfs://localhost:/logs/spark
     spark.history.fs.logDirectory hdfs://localhost:/logs/spark
     STOP
   '';
 };
 in
 {
   imports = [
     <nixos-hardware/common/cpu/amd>

@@ -102,7 +142,7 @@
       };
     };
     clamav = { daemon.enable = true; updater.enable = true; };
-    dbus.packages = with pkgs; [ gnome.dconf ];
+    dbus.packages = with pkgs; [ pkgs.dconf ];
     gnome.gnome-keyring.enable = true;
     gvfs = { enable = true; package = pkgs.gnome3.gvfs; };
     fwupd.enable = true;

@@ -154,11 +194,104 @@
     };
     # gnome.gnome-remote-desktop.enable = true;
     zerotierone = { enable = true; joinNetworks = [ "8056c2e21cf9c753" ]; };

     spark = {
       master = {
         enable = true;
         restartIfChanged = true;
       };
       worker = {
         enable = true;
         restartIfChanged = true;
       };
       confDir = sparkConfDir;
     };

     hadoop = {
       coreSite = {
         "fs.defaultFS" = "hdfs://localhost:8020";
       };
       hdfsSite = {
         "dfs.namenode.rpc-bind-host" = "0.0.0.0";
         "dfs.permissions" = "false";
       };

       hdfs = {
         namenode = {
           enable = true;
           formatOnInit = true;
           restartIfChanged = true;
         };
         datanode = {
           enable = true;
           restartIfChanged = true;
         };
         journalnode = {
           enable = true;
           restartIfChanged = true;
         };
         zkfc = {
           enable = true;
           restartIfChanged = true;
         };
         httpfs = {
           enable = true;
           restartIfChanged = true;
         };
       };
       yarn = {
         resourcemanager.enable = true;
         nodemanager.enable = true;
       };
     };

     ethminer = {
       enable = false;
       wallet = "0x73b788882e1C182123333f42FFf275B7dd7f51bb";
       toolkit = "opencl";
       rig = "thor";
       pool = "eth-eu1.nanopool.org";
       stratumPort = 9999;
       registerMail = "";
     };

     # teamviewer.enable = true;
   };

   systemd.services.spark-history = {
     path = with pkgs; [ procps openssh nettools ];
     description = "Spark history server";
     after = [ "network.target" ];
     wantedBy = [ "multi-user.target" ];
     restartIfChanged = true;
     environment = {
       SPARK_CONF_DIR = sparkConfDir;
       SPARK_LOG_DIR = "/var/log/spark";
     };
     serviceConfig = {
       Type = "forking";
       User = "spark";
       Group = "spark";
       WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}";
       ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh";
       ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh";
       TimeoutSec = 300;
       StartLimitBurst = 10;
       Restart = "always";
     };
   };

   services.teamviewer.enable = true;

   security = {
     pam.services."kde" = {
       enableKwallet = true;
     };
     rtkit.enable = true;
     sudo.extraConfig = ''
       Defaults pwfeedback

@@ -187,7 +320,8 @@
     allowUnfree = true;
     packageOverrides = pkgs: {
       steam = pkgs.steam.override {
-        extraPkgs = pkgs: with pkgs; [ icu ];
+        extraPkgs = pkgs: with pkgs; [ ];
+        extraLibraries = pkgs: with pkgs; [ fontconfig.lib icu freetype ];
       };
     };
     # cudaSupport = true;
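Bootstrap note (not part of the commit): the history server and the spark.eventLog.dir above both point at hdfs://localhost:/logs/spark, and that directory has to exist in HDFS before event logging works. A minimal sketch, assuming the hdfs-namenode/hdfs-datanode and spark-master/spark-worker unit names generated by the NixOS hadoop and spark modules:

    # check the units created by the config above (spark-history is defined in this commit)
    systemctl status hdfs-namenode hdfs-datanode spark-master spark-worker spark-history
    # create the event-log directory referenced in spark-defaults.conf
    hdfs dfs -mkdir -p /logs/spark
    hdfs dfs -chmod 1777 /logs/spark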

spark_conf/fairscheduler.xml.template (new file, 31 lines)

@@ -0,0 +1,31 @@
<?xml version="1.0"?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<allocations>
  <pool name="production">
    <schedulingMode>FAIR</schedulingMode>
    <weight>1</weight>
    <minShare>2</minShare>
  </pool>
  <pool name="test">
    <schedulingMode>FIFO</schedulingMode>
    <weight>2</weight>
    <minShare>3</minShare>
  </pool>
</allocations>
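The pools above only take effect when fair scheduling is switched on; by default Spark schedules jobs FIFO and ignores this file. A sketch of enabling it at submit time (spark.scheduler.* are standard Spark properties; the SparkPi jar is the stock example shipped with Spark):

    spark-submit \
      --conf spark.scheduler.mode=FAIR \
      --conf spark.scheduler.allocation.file="$SPARK_CONF_DIR/fairscheduler.xml" \
      --class org.apache.spark.examples.SparkPi \
      "$SPARK_HOME"/examples/jars/spark-examples_*.jar 10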

spark_conf/log4j.properties (new file, 46 lines)

@@ -0,0 +1,46 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN

# Settings to quiet third party logs that are too verbose
log4j.logger.org.sparkproject.jetty=WARN
log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

# For deploying Spark ThriftServer
# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
log4j.appender.console.filter.1.AcceptOnMatch=false

spark_conf/log4j.properties.template (new file, 46 lines)

@@ -0,0 +1,46 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN

# Settings to quiet third party logs that are too verbose
log4j.logger.org.sparkproject.jetty=WARN
log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

# For deploying Spark ThriftServer
# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
log4j.appender.console.filter.1.AcceptOnMatch=false

spark_conf/metrics.properties.template (new file, 210 lines)

@@ -0,0 +1,210 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# syntax: [instance].sink|source.[name].[options]=[value]

# This file configures Spark's internal metrics system. The metrics system is
# divided into instances which correspond to internal components.
# Each instance can be configured to report its metrics to one or more sinks.
# Accepted values for [instance] are "master", "worker", "executor", "driver",
# and "applications". A wildcard "*" can be used as an instance name, in
# which case all instances will inherit the supplied property.
#
# Within an instance, a "source" specifies a particular set of grouped metrics.
# there are two kinds of sources:
#   1. Spark internal sources, like MasterSource, WorkerSource, etc, which will
#   collect a Spark component's internal state. Each instance is paired with a
#   Spark source that is added automatically.
#   2. Common sources, like JvmSource, which will collect low level state.
#   These can be added through configuration options and are then loaded
#   using reflection.
#
# A "sink" specifies where metrics are delivered to. Each instance can be
# assigned one or more sinks.
#
# The sink|source field specifies whether the property relates to a sink or
# source.
#
# The [name] field specifies the name of source or sink.
#
# The [options] field is the specific property of this source or sink. The
# source or sink is responsible for parsing this property.
#
# Notes:
#   1. To add a new sink, set the "class" option to a fully qualified class
#   name (see examples below).
#   2. Some sinks involve a polling period. The minimum allowed polling period
#   is 1 second.
#   3. Wildcard properties can be overridden by more specific properties.
#   For example, master.sink.console.period takes precedence over
#   *.sink.console.period.
#   4. A metrics specific configuration
#   "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be
#   added to Java properties using -Dspark.metrics.conf=xxx if you want to
#   customize metrics system. You can also put the file in ${SPARK_HOME}/conf
#   and it will be loaded automatically.
#   5. The MetricsServlet sink is added by default as a sink in the master,
#   worker and driver, and you can send HTTP requests to the "/metrics/json"
#   endpoint to get a snapshot of all the registered metrics in JSON format.
#   For master, requests to the "/metrics/master/json" and
#   "/metrics/applications/json" endpoints can be sent separately to get
#   metrics snapshots of the master instance and applications. This
#   MetricsServlet does not have to be configured.
#   6. The metrics system can also be configured using Spark configuration
#   parameters. The relevant parameter names are formed by adding the
#   prefix "spark.metrics.conf." to the configuration entries detailed in
#   this file (see examples below).

## List of available common sources and their properties.

# org.apache.spark.metrics.source.JvmSource
#   Note: Currently, JvmSource is the only available common source.
#   It can be added to an instance by setting the "class" option to its
#   fully qualified class name (see examples below).

## List of available sinks and their properties.

# org.apache.spark.metrics.sink.ConsoleSink
#   Name:   Default:   Description:
#   period  10         Poll period
#   unit    seconds    Unit of the poll period

# org.apache.spark.metrics.sink.CSVSink
#   Name:      Default:   Description:
#   period     10         Poll period
#   unit       seconds    Unit of the poll period
#   directory  /tmp       Where to store CSV files

# org.apache.spark.metrics.sink.GangliaSink
#   Name:     Default:   Description:
#   host      NONE       Hostname or multicast group of the Ganglia server,
#                        must be set
#   port      NONE       Port of the Ganglia server(s), must be set
#   period    10         Poll period
#   unit      seconds    Unit of the poll period
#   ttl       1          TTL of messages sent by Ganglia
#   dmax      0          Lifetime in seconds of metrics (0 never expired)
#   mode      multicast  Ganglia network mode ('unicast' or 'multicast')

# org.apache.spark.metrics.sink.JmxSink

# org.apache.spark.metrics.sink.MetricsServlet
#   Name:     Default:   Description:
#   path      VARIES*    Path prefix from the web server root
#   sample    false      Whether to show entire set of samples for histograms
#                        ('false' or 'true')
#
# * Default path is /metrics/json for all instances except the master. The
#   master has two paths:
#     /metrics/applications/json # App information
#     /metrics/master/json       # Master information

# org.apache.spark.metrics.sink.PrometheusServlet
#   Name:     Default:   Description:
#   path      VARIES*    Path prefix from the web server root
#
# * Default path is /metrics/prometheus for all instances except the master. The
#   master has two paths:
#     /metrics/applications/prometheus # App information
#     /metrics/master/prometheus       # Master information

# org.apache.spark.metrics.sink.GraphiteSink
#   Name:      Default:      Description:
#   host       NONE          Hostname of the Graphite server, must be set
#   port       NONE          Port of the Graphite server, must be set
#   period     10            Poll period
#   unit       seconds       Unit of the poll period
#   prefix     EMPTY STRING  Prefix to prepend to every metric's name
#   protocol   tcp           Protocol ("tcp" or "udp") to use
#   regex      NONE          Optional filter to send only metrics matching this regex string

# org.apache.spark.metrics.sink.StatsdSink
#   Name:     Default:      Description:
#   host      127.0.0.1     Hostname or IP of StatsD server
#   port      8125          Port of StatsD server
#   period    10            Poll period
#   unit      seconds       Units of poll period
#   prefix    EMPTY STRING  Prefix to prepend to metric name

## Examples
# Enable JmxSink for all instances by class name
#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink

# Enable ConsoleSink for all instances by class name
#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink

# Enable StatsdSink for all instances by class name
#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink
#*.sink.statsd.prefix=spark

# Polling period for the ConsoleSink
#*.sink.console.period=10
# Unit of the polling period for the ConsoleSink
#*.sink.console.unit=seconds

# Polling period for the ConsoleSink specific for the master instance
#master.sink.console.period=15
# Unit of the polling period for the ConsoleSink specific for the master
# instance
#master.sink.console.unit=seconds

# Enable CsvSink for all instances by class name
#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink

# Polling period for the CsvSink
#*.sink.csv.period=1
# Unit of the polling period for the CsvSink
#*.sink.csv.unit=minutes

# Polling directory for CsvSink
#*.sink.csv.directory=/tmp/

# Polling period for the CsvSink specific for the worker instance
#worker.sink.csv.period=10
# Unit of the polling period for the CsvSink specific for the worker instance
#worker.sink.csv.unit=minutes

# Enable Slf4jSink for all instances by class name
#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink

# Polling period for the Slf4JSink
#*.sink.slf4j.period=1
# Unit of the polling period for the Slf4jSink
#*.sink.slf4j.unit=minutes

# Example configuration for Graphite sink
#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink
#*.sink.graphite.host=<graphiteEndPoint_hostName>
#*.sink.graphite.port=<listening_port>
#*.sink.graphite.period=10
#*.sink.graphite.unit=seconds
#*.sink.graphite.prefix=<optional_value>

# Enable JvmSource for instance master, worker, driver and executor
#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource

#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource

#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource

#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource

# Example configuration for PrometheusServlet
#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
#*.sink.prometheusServlet.path=/metrics/prometheus
#master.sink.prometheusServlet.path=/metrics/master/prometheus
#applications.sink.prometheusServlet.path=/metrics/applications/prometheus

spark_conf/spark-defaults.conf (new file, 39 lines)

@@ -0,0 +1,39 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master                     spark://master:7077
# spark.eventLog.enabled           true
# spark.eventLog.dir               hdfs://namenode:8021/directory
# spark.serializer                 org.apache.spark.serializer.KryoSerializer
# spark.driver.memory              5g
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

# spark.io.compression.codec       lzf
# spark.io.compression.codec       org.apache.spark.io.SnappyCompressionCodec
# spark.eventLog.compress          false

spark.eventLog.enabled           true
spark.eventLog.dir               hdfs://localhost:/logs/spark
spark.history.fs.logDirectory    hdfs://localhost:/logs/spark

#
#
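With those three active properties, any submitted job writes its event log to hdfs://localhost:/logs/spark and shows up in the history server, whose web UI defaults to port 18080. A smoke-test sketch, assuming the standalone master on its default port 7077:

    spark-submit \
      --master spark://localhost:7077 \
      --class org.apache.spark.examples.SparkPi \
      "$SPARK_HOME"/examples/jars/spark-examples_*.jar 100
    # list completed applications via the history server's REST API
    curl -s http://localhost:18080/api/v1/applications | head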

spark_conf/spark-defaults.conf.template (new file, 27 lines)

@@ -0,0 +1,27 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master                     spark://master:7077
# spark.eventLog.enabled           true
# spark.eventLog.dir               hdfs://namenode:8021/directory
# spark.serializer                 org.apache.spark.serializer.KryoSerializer
# spark.driver.memory              5g
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

spark_conf/spark-env.sh (new file, 7 lines)

@@ -0,0 +1,7 @@
export JAVA_HOME="/nix/store/d0akdmr675jrlabv7n8syg8yrg1zlyxz-openjdk-8u272-b10"
export SPARK_HOME="/nix/store/zhj5q1pi0bs2lpc0lbkw8qkg03ywx9b8-spark-3.1.2/lib/spark-3.1.2"
export SPARK_DIST_CLASSPATH=/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/mapreduce/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/*
export PYSPARK_PYTHON="/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/bin/python3.9"
export PYTHONPATH="$PYTHONPATH:/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/lib/python3.9/site-packages"
export SPARKR_R_SHELL="/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin/R"
export PATH="$PATH:/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin"
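These are pinned /nix/store paths emitted by the sparkConfDir derivation in nixos/thor.nix, so the checked-in copy goes stale whenever any input is rebuilt. A sanity-check sketch for the file as committed:

    # source the generated environment and confirm the pinned tools still resolve
    . spark_conf/spark-env.sh
    "$JAVA_HOME/bin/java" -version
    "$SPARK_HOME/bin/spark-submit" --version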

spark_conf/spark-env.sh.template (new executable file, 73 lines)

@@ -0,0 +1,73 @@
#!/nix/store/vfai0jim0db67nk9rd7ziq29jxb5n79n-bash-5.1-p8/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client/cluster mode
# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN
# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)

# Options for the daemons used in the standalone deploy mode
# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers

# Options for launcher
# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y")

# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR      Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR       Where log files are stored. (Default: ${SPARK_HOME}/logs)
# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5.
# - SPARK_PID_DIR       Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING  A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS      The scheduling priority for daemons. (Default: 0)
# - SPARK_NO_DAEMONIZE  Run the proposed command in the foreground. It will not output a PID file.

# Options for native BLAS, like Intel MKL, OpenBLAS, and so on.
# You might get better performance to enable these options if using native BLAS (see SPARK-21305).
# - MKL_NUM_THREADS=1        Disable multi-threading of Intel MKL
# - OPENBLAS_NUM_THREADS=1   Disable multi-threading of OpenBLAS

spark_conf/workers.template (new file, 19 lines)

@@ -0,0 +1,19 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# A Spark Worker will be started on each of the machines listed below.
localhost
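This workers list is read by Spark's stock sbin helpers, which ssh into each listed host; the systemd units defined in nixos/thor.nix start the worker directly and, as far as this config goes, do not consult it. Sketch of the script-based path it feeds:

    # starts one worker per line of conf/workers, over ssh
    "$SPARK_HOME"/sbin/start-workers.sh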