From 72df315b4de9a5e72c618c8a932c784404a7b644 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Sat, 12 Feb 2022 09:35:54 +0100 Subject: [PATCH 01/12] Big data configuration --- nixos/thor.nix | 138 +++++++++++++++- spark_conf/fairscheduler.xml.template | 31 ++++ spark_conf/log4j.properties | 46 ++++++ spark_conf/log4j.properties.template | 46 ++++++ spark_conf/metrics.properties.template | 210 ++++++++++++++++++++++++ spark_conf/spark-defaults.conf | 39 +++++ spark_conf/spark-defaults.conf.template | 27 +++ spark_conf/spark-env.sh | 7 + spark_conf/spark-env.sh.template | 73 ++++++++ spark_conf/workers.template | 19 +++ 10 files changed, 634 insertions(+), 2 deletions(-) create mode 100644 spark_conf/fairscheduler.xml.template create mode 100644 spark_conf/log4j.properties create mode 100644 spark_conf/log4j.properties.template create mode 100644 spark_conf/metrics.properties.template create mode 100644 spark_conf/spark-defaults.conf create mode 100644 spark_conf/spark-defaults.conf.template create mode 100644 spark_conf/spark-env.sh create mode 100755 spark_conf/spark-env.sh.template create mode 100644 spark_conf/workers.template diff --git a/nixos/thor.nix b/nixos/thor.nix index 6e7eaad..a2a8371 100644 --- a/nixos/thor.nix +++ b/nixos/thor.nix @@ -3,6 +3,46 @@ # and in the NixOS manual (accessible by running ‘nixos-help’). { config, pkgs, lib, ... }: + +let sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . 
$stdenv/setup + + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + + # create output dirs for new derivation + mkdir -p $out/ + + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done + + # change selected files + cp $out/log4j.properties{.template,} + + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP + + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + STOP + ''; +}; +in { imports = [ @@ -102,7 +142,7 @@ }; }; clamav = { daemon.enable = true; updater.enable = true; }; - dbus.packages = with pkgs; [ gnome.dconf ]; + dbus.packages = with pkgs; [ pkgs.dconf ]; gnome.gnome-keyring.enable = true; gvfs = { enable = true; package = pkgs.gnome3.gvfs; }; fwupd.enable = true; @@ -154,11 +194,104 @@ }; # gnome.gnome-remote-desktop.enable = true; zerotierone = { enable = true; joinNetworks = [ "8056c2e21cf9c753" ]; }; + + + + spark = { + master = { + enable = true; + restartIfChanged = true; + }; + worker = { + enable = true; + restartIfChanged = true; + }; + confDir = sparkConfDir; + }; + + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://localhost:8020"; + }; + hdfsSite = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + "dfs.permissions" = "false"; + }; + + hdfs = { + namenode = { + enable = true; + formatOnInit = true; + restartIfChanged = true; + }; + datanode = { + enable = true; + restartIfChanged = true; + }; + journalnode = { + enable = true; + restartIfChanged = 
true; + }; + zkfc = { + enable = true; + restartIfChanged = true; + }; + httpfs = { + enable = true; + restartIfChanged = true; + }; + }; + yarn = { + resourcemanager.enable = true; + nodemanager.enable = true; + }; + }; + + + ethminer = { + enable = false; + wallet = "0x73b788882e1C182123333f42FFf275B7dd7f51bb"; + toolkit = "opencl"; + rig = "thor"; + pool = "eth-eu1.nanopool.org"; + stratumPort = 9999; + + registerMail = ""; + }; + + + # teamviewer.enable = true; + }; + + systemd.services.spark-history = { + path = with pkgs; [ procps openssh nettools ]; + description = "spark history service."; + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + restartIfChanged = true; + environment = { + SPARK_CONF_DIR = sparkConfDir; + SPARK_LOG_DIR = "/var/log/spark"; + }; + serviceConfig = { + Type = "forking"; + User = "spark"; + Group = "spark"; + WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; + ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; + ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; + TimeoutSec = 300; + StartLimitBurst = 10; + Restart = "always"; + }; }; services.teamviewer.enable = true; security = { + pam.services."kde" = { + enableKwallet = true; + }; rtkit.enable = true; sudo.extraConfig = '' Defaults pwfeedback @@ -187,7 +320,8 @@ allowUnfree = true; packageOverrides = pkgs: { steam = pkgs.steam.override { - extraPkgs = pkgs: with pkgs; [ icu ]; + extraPkgs = pkgs: with pkgs; [ ]; + extraLibraries = pkgs: with pkgs; [ fontconfig.lib icu freetype ]; }; }; # cudaSupport = true; diff --git a/spark_conf/fairscheduler.xml.template b/spark_conf/fairscheduler.xml.template new file mode 100644 index 0000000..385b2e7 --- /dev/null +++ b/spark_conf/fairscheduler.xml.template @@ -0,0 +1,31 @@ + + + + + + + FAIR + 1 + 2 + + + FIFO + 2 + 3 + + diff --git a/spark_conf/log4j.properties b/spark_conf/log4j.properties new file mode 100644 index 0000000..dc7b9ea --- 
/dev/null +++ b/spark_conf/log4j.properties @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
+log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +# For deploying Spark ThriftServer +# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 +log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter +log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message +log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/log4j.properties.template b/spark_conf/log4j.properties.template new file mode 100644 index 0000000..dc7b9ea --- /dev/null +++ b/spark_conf/log4j.properties.template @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. +log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +# For deploying Spark ThriftServer +# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 +log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter +log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message +log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/metrics.properties.template b/spark_conf/metrics.properties.template new file mode 100644 index 0000000..f52d33f --- /dev/null +++ b/spark_conf/metrics.properties.template @@ -0,0 +1,210 @@ +# +# Licensed to the Apache Software Foundation (ASF) under 
one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. +# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. Common sources, like JvmSource, which will collect low level state. +# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. 
+# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. +# 6. The metrics system can also be configured using Spark configuration +# parameters. The relevant parameter names are formed by adding the +# prefix "spark.metrics.conf." to the configuration entries detailed in +# this file (see examples below). + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. 
+ +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. 
The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use +# regex NONE Optional filter to send only metrics matching this regex string + +# org.apache.spark.metrics.sink.StatsdSink +# Name: Default: Description: +# host 127.0.0.1 Hostname or IP of StatsD server +# port 8125 Port of StatsD server +# period 10 Poll period +# unit seconds Units of poll period +# prefix EMPTY STRING Prefix to prepend to metric name + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Enable StatsdSink for all instances by class name +#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink +#*.sink.statsd.prefix=spark + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance 
+#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Example configuration for Graphite sink +#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink +#*.sink.graphite.host= +#*.sink.graphite.port= +#*.sink.graphite.period=10 +#*.sink.graphite.unit=seconds +#*.sink.graphite.prefix= + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus +#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/spark_conf/spark-defaults.conf b/spark_conf/spark-defaults.conf new file mode 100644 index 0000000..678d9c8 --- /dev/null +++ b/spark_conf/spark-defaults.conf @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" + +# spark.io.compression.codec lzf +# spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec +# spark.eventLog.compress false + +spark.eventLog.enabled true +spark.eventLog.dir hdfs://localhost:/logs/spark +spark.history.fs.logDirectory hdfs://localhost:/logs/spark + + +# +# diff --git a/spark_conf/spark-defaults.conf.template b/spark_conf/spark-defaults.conf.template new file mode 100644 index 0000000..19cba6e --- /dev/null +++ b/spark_conf/spark-defaults.conf.template @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/spark_conf/spark-env.sh b/spark_conf/spark-env.sh new file mode 100644 index 0000000..1d67e58 --- /dev/null +++ b/spark_conf/spark-env.sh @@ -0,0 +1,7 @@ +export JAVA_HOME="/nix/store/d0akdmr675jrlabv7n8syg8yrg1zlyxz-openjdk-8u272-b10" +export SPARK_HOME="/nix/store/zhj5q1pi0bs2lpc0lbkw8qkg03ywx9b8-spark-3.1.2/lib/spark-3.1.2" +export 
SPARK_DIST_CLASSPATH=/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/mapreduce/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/* +export PYSPARK_PYTHON="/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/bin/python3.9" +export PYTHONPATH="$PYTHONPATH:/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/lib/python3.9/site-packages" +export SPARKR_R_SHELL="/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin/R" +export PATH="$PATH:/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin" diff --git a/spark_conf/spark-env.sh.template b/spark_conf/spark-env.sh.template new file mode 100755 index 0000000..c868650 --- /dev/null +++ b/spark_conf/spark-env.sh.template @@ -0,0 +1,73 @@ +#!/nix/store/vfai0jim0db67nk9rd7ziq29jxb5n79n-bash-5.1-p8/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. + +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in YARN client/cluster mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 
1000M, 2G) (Default: 1G) + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. 
(Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS diff --git a/spark_conf/workers.template b/spark_conf/workers.template new file mode 100644 index 0000000..be42a63 --- /dev/null +++ b/spark_conf/workers.template @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. 
+localhost \ No newline at end of file From 9e4879f7a0f8b36015d4b91c59bfe540f8d42d90 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Mon, 21 Feb 2022 14:37:23 +0100 Subject: [PATCH 02/12] Big data config for laptop --- nixos/base.nix | 120 +++++++++++++- spark_conf/fairscheduler.xml.template | 31 ---- spark_conf/log4j.properties | 46 ------ spark_conf/log4j.properties.template | 46 ------ spark_conf/metrics.properties.template | 210 ------------------------ spark_conf/spark-defaults.conf | 39 ----- spark_conf/spark-defaults.conf.template | 27 --- spark_conf/spark-env.sh | 7 - spark_conf/spark-env.sh.template | 73 -------- spark_conf/workers.template | 19 --- 10 files changed, 119 insertions(+), 499 deletions(-) delete mode 100644 spark_conf/fairscheduler.xml.template delete mode 100644 spark_conf/log4j.properties delete mode 100644 spark_conf/log4j.properties.template delete mode 100644 spark_conf/metrics.properties.template delete mode 100644 spark_conf/spark-defaults.conf delete mode 100644 spark_conf/spark-defaults.conf.template delete mode 100644 spark_conf/spark-env.sh delete mode 100755 spark_conf/spark-env.sh.template delete mode 100644 spark_conf/workers.template diff --git a/nixos/base.nix b/nixos/base.nix index 4896ec9..4a68410 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -1,4 +1,46 @@ -{ config, pkgs, lib, ... }: { +{ config, pkgs, lib, ... }: + +let sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . 
$stdenv/setup + + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + + # create output dirs for new derivation + mkdir -p $out/ + + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done + + # change selected files + cp $out/log4j.properties{.template,} + + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP + + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + STOP + ''; +}; +in + +{ imports = [ @@ -210,6 +252,82 @@ services.power-profiles-daemon.enable = true; # services.teamviewer.enable = true; + services = { + + spark = { + master = { + enable = true; + restartIfChanged = true; + }; + worker = { + enable = true; + restartIfChanged = true; + }; + confDir = sparkConfDir; + }; + + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://localhost:8020"; + }; + hdfsSite = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + "dfs.permissions" = "false"; + }; + + hdfs = { + namenode = { + enable = true; + formatOnInit = true; + restartIfChanged = true; + }; + datanode = { + enable = true; + restartIfChanged = true; + }; + journalnode = { + enable = true; + restartIfChanged = true; + }; + zkfc = { + enable = true; + restartIfChanged = true; + }; + httpfs = { + enable = true; + restartIfChanged = true; + }; + }; + yarn = { + resourcemanager.enable = true; + nodemanager.enable = true; + }; + }; + }; + + systemd.services.spark-history = { + path = with pkgs; [ procps openssh nettools ]; + 
description = "spark history service."; + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + restartIfChanged = true; + environment = { + SPARK_CONF_DIR = sparkConfDir; + SPARK_LOG_DIR = "/var/log/spark"; + }; + serviceConfig = { + Type = "forking"; + User = "spark"; + Group = "spark"; + WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; + ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; + ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; + TimeoutSec = 300; + StartLimitBurst = 10; + Restart = "always"; + }; + }; + # Virtualisation virtualisation = { diff --git a/spark_conf/fairscheduler.xml.template b/spark_conf/fairscheduler.xml.template deleted file mode 100644 index 385b2e7..0000000 --- a/spark_conf/fairscheduler.xml.template +++ /dev/null @@ -1,31 +0,0 @@ - - - - - - - FAIR - 1 - 2 - - - FIFO - 2 - 3 - - diff --git a/spark_conf/log4j.properties b/spark_conf/log4j.properties deleted file mode 100644 index dc7b9ea..0000000 --- a/spark_conf/log4j.properties +++ /dev/null @@ -1,46 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. -log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.sparkproject.jetty=WARN -log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR - -# For deploying Spark ThriftServer -# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 -log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter -log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message -log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/log4j.properties.template b/spark_conf/log4j.properties.template deleted file mode 100644 index dc7b9ea..0000000 --- a/spark_conf/log4j.properties.template +++ /dev/null @@ -1,46 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. 
-log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.sparkproject.jetty=WARN -log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR - -# For deploying Spark ThriftServer -# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 -log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter -log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message -log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/metrics.properties.template b/spark_conf/metrics.properties.template deleted file mode 100644 index f52d33f..0000000 --- a/spark_conf/metrics.properties.template +++ /dev/null @@ -1,210 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -# syntax: [instance].sink|source.[name].[options]=[value] - -# This file configures Spark's internal metrics system. The metrics system is -# divided into instances which correspond to internal components. -# Each instance can be configured to report its metrics to one or more sinks. -# Accepted values for [instance] are "master", "worker", "executor", "driver", -# and "applications". A wildcard "*" can be used as an instance name, in -# which case all instances will inherit the supplied property. -# -# Within an instance, a "source" specifies a particular set of grouped metrics. -# there are two kinds of sources: -# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will -# collect a Spark component's internal state. Each instance is paired with a -# Spark source that is added automatically. -# 2. Common sources, like JvmSource, which will collect low level state. -# These can be added through configuration options and are then loaded -# using reflection. -# -# A "sink" specifies where metrics are delivered to. Each instance can be -# assigned one or more sinks. -# -# The sink|source field specifies whether the property relates to a sink or -# source. -# -# The [name] field specifies the name of source or sink. -# -# The [options] field is the specific property of this source or sink. The -# source or sink is responsible for parsing this property. -# -# Notes: -# 1. To add a new sink, set the "class" option to a fully qualified class -# name (see examples below). -# 2. Some sinks involve a polling period. The minimum allowed polling period -# is 1 second. -# 3. Wildcard properties can be overridden by more specific properties. -# For example, master.sink.console.period takes precedence over -# *.sink.console.period. -# 4. 
A metrics specific configuration -# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be -# added to Java properties using -Dspark.metrics.conf=xxx if you want to -# customize metrics system. You can also put the file in ${SPARK_HOME}/conf -# and it will be loaded automatically. -# 5. The MetricsServlet sink is added by default as a sink in the master, -# worker and driver, and you can send HTTP requests to the "/metrics/json" -# endpoint to get a snapshot of all the registered metrics in JSON format. -# For master, requests to the "/metrics/master/json" and -# "/metrics/applications/json" endpoints can be sent separately to get -# metrics snapshots of the master instance and applications. This -# MetricsServlet does not have to be configured. -# 6. The metrics system can also be configured using Spark configuration -# parameters. The relevant parameter names are formed by adding the -# prefix "spark.metrics.conf." to the configuration entries detailed in -# this file (see examples below). - -## List of available common sources and their properties. - -# org.apache.spark.metrics.source.JvmSource -# Note: Currently, JvmSource is the only available common source. -# It can be added to an instance by setting the "class" option to its -# fully qualified class name (see examples below). - -## List of available sinks and their properties. 
- -# org.apache.spark.metrics.sink.ConsoleSink -# Name: Default: Description: -# period 10 Poll period -# unit seconds Unit of the poll period - -# org.apache.spark.metrics.sink.CSVSink -# Name: Default: Description: -# period 10 Poll period -# unit seconds Unit of the poll period -# directory /tmp Where to store CSV files - -# org.apache.spark.metrics.sink.GangliaSink -# Name: Default: Description: -# host NONE Hostname or multicast group of the Ganglia server, -# must be set -# port NONE Port of the Ganglia server(s), must be set -# period 10 Poll period -# unit seconds Unit of the poll period -# ttl 1 TTL of messages sent by Ganglia -# dmax 0 Lifetime in seconds of metrics (0 never expired) -# mode multicast Ganglia network mode ('unicast' or 'multicast') - -# org.apache.spark.metrics.sink.JmxSink - -# org.apache.spark.metrics.sink.MetricsServlet -# Name: Default: Description: -# path VARIES* Path prefix from the web server root -# sample false Whether to show entire set of samples for histograms -# ('false' or 'true') -# -# * Default path is /metrics/json for all instances except the master. The -# master has two paths: -# /metrics/applications/json # App information -# /metrics/master/json # Master information - -# org.apache.spark.metrics.sink.PrometheusServlet -# Name: Default: Description: -# path VARIES* Path prefix from the web server root -# -# * Default path is /metrics/prometheus for all instances except the master. 
The -# master has two paths: -# /metrics/applications/prometheus # App information -# /metrics/master/prometheus # Master information - -# org.apache.spark.metrics.sink.GraphiteSink -# Name: Default: Description: -# host NONE Hostname of the Graphite server, must be set -# port NONE Port of the Graphite server, must be set -# period 10 Poll period -# unit seconds Unit of the poll period -# prefix EMPTY STRING Prefix to prepend to every metric's name -# protocol tcp Protocol ("tcp" or "udp") to use -# regex NONE Optional filter to send only metrics matching this regex string - -# org.apache.spark.metrics.sink.StatsdSink -# Name: Default: Description: -# host 127.0.0.1 Hostname or IP of StatsD server -# port 8125 Port of StatsD server -# period 10 Poll period -# unit seconds Units of poll period -# prefix EMPTY STRING Prefix to prepend to metric name - -## Examples -# Enable JmxSink for all instances by class name -#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink - -# Enable ConsoleSink for all instances by class name -#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink - -# Enable StatsdSink for all instances by class name -#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink -#*.sink.statsd.prefix=spark - -# Polling period for the ConsoleSink -#*.sink.console.period=10 -# Unit of the polling period for the ConsoleSink -#*.sink.console.unit=seconds - -# Polling period for the ConsoleSink specific for the master instance -#master.sink.console.period=15 -# Unit of the polling period for the ConsoleSink specific for the master -# instance -#master.sink.console.unit=seconds - -# Enable CsvSink for all instances by class name -#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink - -# Polling period for the CsvSink -#*.sink.csv.period=1 -# Unit of the polling period for the CsvSink -#*.sink.csv.unit=minutes - -# Polling directory for CsvSink -#*.sink.csv.directory=/tmp/ - -# Polling period for the CsvSink specific for the worker instance 
-#worker.sink.csv.period=10 -# Unit of the polling period for the CsvSink specific for the worker instance -#worker.sink.csv.unit=minutes - -# Enable Slf4jSink for all instances by class name -#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink - -# Polling period for the Slf4JSink -#*.sink.slf4j.period=1 -# Unit of the polling period for the Slf4jSink -#*.sink.slf4j.unit=minutes - -# Example configuration for Graphite sink -#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink -#*.sink.graphite.host= -#*.sink.graphite.port= -#*.sink.graphite.period=10 -#*.sink.graphite.unit=seconds -#*.sink.graphite.prefix= - -# Enable JvmSource for instance master, worker, driver and executor -#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -# Example configuration for PrometheusServlet -#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet -#*.sink.prometheusServlet.path=/metrics/prometheus -#master.sink.prometheusServlet.path=/metrics/master/prometheus -#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/spark_conf/spark-defaults.conf b/spark_conf/spark-defaults.conf deleted file mode 100644 index 678d9c8..0000000 --- a/spark_conf/spark-defaults.conf +++ /dev/null @@ -1,39 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" - -# spark.io.compression.codec lzf -# spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec -# spark.eventLog.compress false - -spark.eventLog.enabled true -spark.eventLog.dir hdfs://localhost:/logs/spark -spark.history.fs.logDirectory hdfs://localhost:/logs/spark - - -# -# diff --git a/spark_conf/spark-defaults.conf.template b/spark_conf/spark-defaults.conf.template deleted file mode 100644 index 19cba6e..0000000 --- a/spark_conf/spark-defaults.conf.template +++ /dev/null @@ -1,27 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/spark_conf/spark-env.sh b/spark_conf/spark-env.sh deleted file mode 100644 index 1d67e58..0000000 --- a/spark_conf/spark-env.sh +++ /dev/null @@ -1,7 +0,0 @@ -export JAVA_HOME="/nix/store/d0akdmr675jrlabv7n8syg8yrg1zlyxz-openjdk-8u272-b10" -export SPARK_HOME="/nix/store/zhj5q1pi0bs2lpc0lbkw8qkg03ywx9b8-spark-3.1.2/lib/spark-3.1.2" -export 
SPARK_DIST_CLASSPATH=/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/mapreduce/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/* -export PYSPARK_PYTHON="/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/bin/python3.9" -export PYTHONPATH="$PYTHONPATH:/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/lib/python3.9/site-packages" -export SPARKR_R_SHELL="/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin/R" -export PATH="$PATH:/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin" diff --git a/spark_conf/spark-env.sh.template b/spark_conf/spark-env.sh.template deleted file mode 100755 index c868650..0000000 --- a/spark_conf/spark-env.sh.template +++ /dev/null @@ -1,73 +0,0 @@ -#!/nix/store/vfai0jim0db67nk9rd7ziq29jxb5n79n-bash-5.1-p8/bin/bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This file is sourced when running various Spark programs. -# Copy it as spark-env.sh and edit that to configure Spark for your site. - -# Options read when launching programs locally with -# ./bin/run-example or ./bin/spark-submit -# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node -# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program - -# Options read by executors and drivers running inside the cluster -# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node -# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program -# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data -# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos - -# Options read in YARN client/cluster mode -# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) -# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN -# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). -# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) -# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 
1000M, 2G) (Default: 1G) - -# Options for the daemons used in the standalone deploy mode -# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname -# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master -# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") -# - SPARK_WORKER_CORES, to set the number of cores to use on this machine -# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) -# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker -# - SPARK_WORKER_DIR, to set the working directory of worker processes -# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") -# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). -# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") -# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") -# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") -# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons -# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers - -# Options for launcher -# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") - -# Generic options for the daemons used in the standalone deploy mode -# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) -# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) -# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. -# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) -# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) -# - SPARK_NICENESS The scheduling priority for daemons. 
(Default: 0) -# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. -# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. -# You might get better performance to enable these options if using native BLAS (see SPARK-21305). -# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL -# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS diff --git a/spark_conf/workers.template b/spark_conf/workers.template deleted file mode 100644 index be42a63..0000000 --- a/spark_conf/workers.template +++ /dev/null @@ -1,19 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# A Spark Worker will be started on each of the machines listed below. 
-localhost \ No newline at end of file From 5a0f9cc5b4b15d4167da00e1997cf15e59afc79c Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Fri, 11 Mar 2022 12:27:00 +0100 Subject: [PATCH 03/12] Use ssd storage --- nixos/base.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nixos/base.nix b/nixos/base.nix index 4a68410..69b4d73 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -273,6 +273,8 @@ in hdfsSite = { "dfs.namenode.rpc-bind-host" = "0.0.0.0"; "dfs.permissions" = "false"; + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; }; hdfs = { From 1680b2bc455e86658b47ee6e090ed93d90b96383 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Mon, 21 Mar 2022 09:50:26 +0100 Subject: [PATCH 04/12] Centralized big_data config --- nixos/base.nix | 120 +-------------------------------------------- nixos/big_data.nix | 120 +++++++++++++++++++++++++++++++++++++++++++++ nixos/thor.nix | 93 +---------------------------------- 3 files changed, 122 insertions(+), 211 deletions(-) create mode 100644 nixos/big_data.nix diff --git a/nixos/base.nix b/nixos/base.nix index 69b4d73..3240307 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -1,45 +1,4 @@ { config, pkgs, lib, ... }: - -let sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; - dontUnpack = true; - installPhase = '' - # source standard environment - . 
$stdenv/setup - - # shorthands - base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ - - # create output dirs for new derivation - mkdir -p $out/ - - # link unchanged files from the original gnome-session - for f in $base_conf/*.template ; do - ln -sf $f $out/ - done - - # change selected files - cp $out/log4j.properties{.template,} - - cat > $out/spark-env.sh <<- STOP - export JAVA_HOME="${pkgs.jdk8}" - export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" - export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" - export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" - export SPARKR_R_SHELL="${pkgs.R}/bin/R" - export PATH="\$PATH:${pkgs.R}/bin" - STOP - - cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - STOP - ''; -}; -in - { imports = [ @@ -53,6 +12,7 @@ in # ./defcon.nix # ./mind.nix # ./k3s.nix + ./big_data.nix ]; # Use the systemd-boot EFI boot loader. 
@@ -252,84 +212,6 @@ in services.power-profiles-daemon.enable = true; # services.teamviewer.enable = true; - services = { - - spark = { - master = { - enable = true; - restartIfChanged = true; - }; - worker = { - enable = true; - restartIfChanged = true; - }; - confDir = sparkConfDir; - }; - - hadoop = { - coreSite = { - "fs.defaultFS" = "hdfs://localhost:8020"; - }; - hdfsSite = { - "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - "dfs.permissions" = "false"; - "dfs.namenode.name.dir" = "/hdfs/dfs/name"; - "dfs.datanode.data.dir" = "/hdfs/dfs/data"; - }; - - hdfs = { - namenode = { - enable = true; - formatOnInit = true; - restartIfChanged = true; - }; - datanode = { - enable = true; - restartIfChanged = true; - }; - journalnode = { - enable = true; - restartIfChanged = true; - }; - zkfc = { - enable = true; - restartIfChanged = true; - }; - httpfs = { - enable = true; - restartIfChanged = true; - }; - }; - yarn = { - resourcemanager.enable = true; - nodemanager.enable = true; - }; - }; - }; - - systemd.services.spark-history = { - path = with pkgs; [ procps openssh nettools ]; - description = "spark history service."; - after = [ "network.target" ]; - wantedBy = [ "multi-user.target" ]; - restartIfChanged = true; - environment = { - SPARK_CONF_DIR = sparkConfDir; - SPARK_LOG_DIR = "/var/log/spark"; - }; - serviceConfig = { - Type = "forking"; - User = "spark"; - Group = "spark"; - WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; - ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; - ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; - TimeoutSec = 300; - StartLimitBurst = 10; - Restart = "always"; - }; - }; - # Virtualisation virtualisation = { diff --git a/nixos/big_data.nix b/nixos/big_data.nix new file mode 100644 index 0000000..3d61957 --- /dev/null +++ b/nixos/big_data.nix @@ -0,0 +1,120 @@ +{ config, lib, pkgs, ... 
}: +let sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . $stdenv/setup + + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + + # create output dirs for new derivation + mkdir -p $out/ + + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done + + # change selected files + cp $out/log4j.properties{.template,} + + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP + + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + STOP + ''; +}; +in +{ + + services = { + spark = { + master = { + enable = true; + restartIfChanged = true; + }; + worker = { + enable = true; + restartIfChanged = true; + }; + confDir = sparkConfDir; + }; + + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://localhost:8020"; + }; + hdfsSite = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + "dfs.permissions" = "false"; + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + }; + + hdfs = { + namenode = { + enable = true; + formatOnInit = true; + restartIfChanged = true; + }; + datanode = { + enable = true; + restartIfChanged = true; + }; + journalnode = { + enable = true; + restartIfChanged = true; + }; + zkfc = { + enable = true; + restartIfChanged = true; + }; + httpfs = { + enable = true; + restartIfChanged = true; + }; + }; + yarn = { + resourcemanager.enable = true; + 
nodemanager.enable = true; + }; + }; + }; + + systemd.services.spark-history = { + path = with pkgs; [ procps openssh nettools ]; + description = "spark history service."; + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + restartIfChanged = true; + environment = { + SPARK_CONF_DIR = sparkConfDir; + SPARK_LOG_DIR = "/var/log/spark"; + }; + serviceConfig = { + Type = "forking"; + User = "spark"; + Group = "spark"; + WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; + ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; + ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; + TimeoutSec = 300; + StartLimitBurst = 10; + Restart = "always"; + }; + }; + +} diff --git a/nixos/thor.nix b/nixos/thor.nix index a2a8371..7b41f46 100644 --- a/nixos/thor.nix +++ b/nixos/thor.nix @@ -4,51 +4,13 @@ { config, pkgs, lib, ... }: -let sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; - dontUnpack = true; - installPhase = '' - # source standard environment - . 
$stdenv/setup - - # shorthands - base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ - - # create output dirs for new derivation - mkdir -p $out/ - - # link unchanged files from the original gnome-session - for f in $base_conf/*.template ; do - ln -sf $f $out/ - done - - # change selected files - cp $out/log4j.properties{.template,} - - cat > $out/spark-env.sh <<- STOP - export JAVA_HOME="${pkgs.jdk8}" - export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" - export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" - export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" - export SPARKR_R_SHELL="${pkgs.R}/bin/R" - export PATH="\$PATH:${pkgs.R}/bin" - STOP - - cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - STOP - ''; -}; -in { imports = [ /etc/nixos/hardware-configuration.nix ./pro_audio.nix + ./big_data.nix ]; boot = { @@ -195,59 +157,6 @@ in # gnome.gnome-remote-desktop.enable = true; zerotierone = { enable = true; joinNetworks = [ "8056c2e21cf9c753" ]; }; - - - spark = { - master = { - enable = true; - restartIfChanged = true; - }; - worker = { - enable = true; - restartIfChanged = true; - }; - confDir = sparkConfDir; - }; - - hadoop = { - coreSite = { - "fs.defaultFS" = "hdfs://localhost:8020"; - }; - hdfsSite = { - "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - "dfs.permissions" = "false"; - }; - - hdfs = { - namenode = { - enable = true; - formatOnInit = true; - restartIfChanged = true; - }; - datanode = { - enable = true; - restartIfChanged = true; - }; - journalnode = { - enable = true; - restartIfChanged = true; - }; - zkfc = { - enable = true; - restartIfChanged = true; - }; - httpfs = { - enable = true; - restartIfChanged = true; - }; - }; - yarn = { - resourcemanager.enable = true; - nodemanager.enable = true; 
- }; - }; - - ethminer = { enable = false; wallet = "0x73b788882e1C182123333f42FFf275B7dd7f51bb"; From 2bcd4152a991f6e25f4ec15d38d093322c94d5f5 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Mon, 21 Mar 2022 12:21:22 +0100 Subject: [PATCH 05/12] Basic kerberos config --- nixos/big_data.nix | 36 ++++++++++++++++++++++++++++++++++++ odin.nix | 1 - 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 3d61957..44e8aeb 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -92,8 +92,44 @@ in nodemanager.enable = true; }; }; + + kerberos_server = { + enable = true; + realms."ATHENA.MIT.EDU" = { + acl = [ + { access = "all"; principal = "*/admin"; } + { access = "all"; principal = "admin"; } + ]; + }; + }; }; + krb5 = { + enable = true; + realms."ATHENA.MIT.EDU" = { + admin_server = "localhost"; + kdc = [ + "localhost" + ]; + kpasswd_server = "localhost"; + }; + domain_realm = { + ".athena.mit.edu" = "ATHENA.MIT.EDU"; + "athena.mit.edu" = "ATHENA.MIT.EDU"; + }; + libdefaults = { + default_realm = "ATHENA.MIT.EDU"; + dns_lookup_realm = false; + dns_lookup_kdc = false; + }; + extraConfig = '' + [logging] + default = FILE:/var/log/krb5.log + ''; + }; + + + systemd.services.spark-history = { path = with pkgs; [ procps openssh nettools ]; description = "spark history service."; diff --git a/odin.nix b/odin.nix index c5df576..528bf04 100644 --- a/odin.nix +++ b/odin.nix @@ -45,7 +45,6 @@ ark authy bitwarden - blender btop catgirl # IRC cava From 7ad8a748f1b4fce010d8f4d35f8be94ed4fc1a78 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Thu, 24 Mar 2022 09:36:46 +0100 Subject: [PATCH 06/12] Kerberos config --- nixos/big_data.nix | 211 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 160 insertions(+), 51 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 44e8aeb..d4d161e 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -40,6 +40,17 @@ let sparkConfDir = 
pkgs.stdenv.mkDerivation { in { + networking = { + hosts = { + "127.0.0.1" = [ + "ds.my.engine" + "kdc.my.engine" + "my.engine" + ]; + }; + + }; + services = { spark = { master = { @@ -53,83 +64,181 @@ in confDir = sparkConfDir; }; - hadoop = { - coreSite = { - "fs.defaultFS" = "hdfs://localhost:8020"; - }; - hdfsSite = { - "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - "dfs.permissions" = "false"; - "dfs.namenode.name.dir" = "/hdfs/dfs/name"; - "dfs.datanode.data.dir" = "/hdfs/dfs/data"; - }; - hdfs = { - namenode = { - enable = true; - formatOnInit = true; - restartIfChanged = true; + + hadoop = + let + keytab_path = /etc/hadoop.keytab; + in + + { + coreSite = { + # "fs.defaultFS" = "hdfs://0.0.0.0:8020"; + + # "hadoop.http.authentication.simple.anonymous.allowed" = "false"; + # "hadoop.http.authentication.signature.secret.file" = "/var/lib/hadoop/security/http_secret"; + # "hadoop.http.authentication.type" = "kerberos"; + # "hadoop.http.authentication.kerberos.principal" = "http/my.engine@MY.ENGINE"; + # "hadoop.http.authentication.cookie.domain" = "my.engine"; + + # "hadoop.security.authentication" = "kerberos"; + # "hadoop.security.authorization" = "true"; + # "hadoop.rpc.protection" = "authentication"; + + + # "hadoop.rpc.protection" = "authentication"; + # "hadoop.security.auth_to_local" = '' + # RULE:[2:$1/$2@$0]([ndj]n/.*@MY.ENGINE)s/.*/hdfs/ + # RULE:[2:$1/$2@$0]([rn]m/.*@MY.ENGINE)s/.*/yarn/ + # RULE:[2:$1/$2@$0](jhs/.*@MY.ENGINE)s/.*/mapred/ + # DEFAULT + # ''; + # "hadoop.proxyuser.superuser.hosts" = "*"; # TODO: restrict + # "hadoop.proxyuser.superuser.groups" = "*"; # TODO: restrict + + "fs.defaultFS" = "hdfs://my.engine:8020"; + + # HDFS IMPERSONATION + "hadoop.proxyuser.hdfs.hosts" = "*"; + "hadoop.proxyuser.hdfs.groups" = "*"; + + # HIVE IMPERSONATION + "hadoop.proxyuser.hive.hosts" = "*"; + "hadoop.proxyuser.hive.groups" = "*"; + + # ENABLE AUTHENTICATION + "hadoop.security.authentication" = "kerberos"; + "hadoop.security.authorization" = "true"; + 
"hadoop.rpc.protection" = "privacy"; + + "hadoop.security.auth_to_local" = '' + RULE:[2:$1/$2@$0]([ndj]n/.*@MY\.ENGINE)s/.*/hdfs/ + RULE:[2:$1/$2@$0]([rn]m/.*@MY\.ENGINE)s/.*/yarn/ + RULE:[2:$1/$2@$0](jhs/.*@MY\.ENGINE)s/.*/mapred/ + DEFAULT + ''; }; - datanode = { - enable = true; - restartIfChanged = true; + hdfsSite = { + # DATA + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + + # HDFS SECURITY + "dfs.block.access.token.enable" = "true"; + + # NAME NODE SECURITY + "dfs.namenode.keytab.file" = keytab_path; + "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # SECONDARY NAME NODE SECURITY + "dfs.secondary.namenode.keytab.file" = keytab_path; + "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # DATA NODE SECURITY + "dfs.datanode.keytab.file" = keytab_path; + "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; + + # WEBHDFS SECURITY + "dfs.webhdfs.enabled" = "true"; + + # WEB AUTHENTICATION CONFIG + "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; + "dfs.web.authentication.kerberos.keytab" = keytab_path; + "ignore.secure.ports.for.testing" = "true"; + "dfs.http.policy" = "HTTP_ONLY"; + "dfs.data.transfer.protection" = "privacy"; + + # ## MULTIHOMED + # "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.servicerpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.http-bind-host" = "0.0.0.0"; + # "dfs.namenode.https-bind-host" = "0.0.0.0"; + # "dfs.client.use.datanode.hostname" = "true"; # force connection by hostname + # "dfs.datanode.use.datanode.hostname" = "true"; # force connection by hostname + + + # "dfs.data.transfer.protection" = "privacy"; + # "hadoop.rpc.protection" = "privacy"; + # "dfs.http.policy" = "HTTP_ONLY"; + # "dfs.datanode.address" = "0.0.0.0:10019"; + # 
"dfs.datanode.http.address" = "0.0.0.0:10022"; + # "dfs.datanode.https.address" = "0.0.0.0:10023"; + + + # "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; + # "dfs.datanode.keytab.file" = keytab_path; + + # "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + # "dfs.namenode.keytab.file" = keytab_path; + + # "dfs.block.access.token.enable" = "true"; + + }; - journalnode = { - enable = true; - restartIfChanged = true; + yarnSite = { + # "yarn.acl.enable" = "true"; + # "yarn.admin.acl" = "*"; # TODO: restrict }; - zkfc = { - enable = true; - restartIfChanged = true; - }; - httpfs = { - enable = true; - restartIfChanged = true; + extraConfDirs = [ ]; + + hdfs = { + namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; + datanode = { enable = true; restartIfChanged = true; }; + journalnode = { enable = true; restartIfChanged = true; }; + zkfc = { enable = true; restartIfChanged = true; }; + httpfs = { enable = true; restartIfChanged = true; }; }; + yarn = { resourcemanager.enable = true; nodemanager.enable = true; }; }; - yarn = { - resourcemanager.enable = true; - nodemanager.enable = true; - }; - }; kerberos_server = { enable = true; - realms."ATHENA.MIT.EDU" = { - acl = [ - { access = "all"; principal = "*/admin"; } - { access = "all"; principal = "admin"; } - ]; - }; + realms."MY.ENGINE".acl = [ + { principal = "*/admin"; access = "all"; } + { principal = "admin"; access = "all"; } + { principal = "*/localhost"; access = "all"; } + { principal = "*/my.engine"; access = "all"; } + { principal = "nn/my.engine"; access = "all"; } + { principal = "hdfs"; access = "all"; } + ]; }; }; krb5 = { enable = true; - realms."ATHENA.MIT.EDU" = { - admin_server = "localhost"; - kdc = [ - "localhost" - ]; - kpasswd_server = "localhost"; + realms = { + "MY.ENGINE" = { + admin_server = "kdc.my.engine"; + kdc = "kdc.my.engine"; + # default_domain = "my.engine"; + # kpasswd_server = "odin"; + }; }; domain_realm = { - ".athena.mit.edu" = 
"ATHENA.MIT.EDU"; - "athena.mit.edu" = "ATHENA.MIT.EDU"; + # ".my.engine" = "MY.ENGINE"; + "my.engine" = "MY.ENGINE"; }; libdefaults = { - default_realm = "ATHENA.MIT.EDU"; - dns_lookup_realm = false; - dns_lookup_kdc = false; + default_realm = "MY.ENGINE"; + dns_lookup_realm = true; + dns_lookup_kdc = true; + ticket_lifetime = "24h"; + renew_lifetime = "7d"; + forwardable = true; }; extraConfig = '' [logging] - default = FILE:/var/log/krb5.log + default = FILE:/var/log/krb5libs.log + kdc = FILE:/var/log/krb5kdc.log + admin_server = FILE:/var/log/kadmind.log ''; }; + systemd.services.spark-history = { path = with pkgs; [ procps openssh nettools ]; description = "spark history service."; From a27dbfd6813fe44c72beac93c20e8d40278889f7 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Thu, 24 Mar 2022 17:51:24 +0100 Subject: [PATCH 07/12] HDSF + KERBEROS --- nixos/base.nix | 3 +- nixos/big_data.nix | 300 ++++++++++++++++++++++----------------------- 2 files changed, 146 insertions(+), 157 deletions(-) diff --git a/nixos/base.nix b/nixos/base.nix index 3240307..0b5ded8 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -1,6 +1,5 @@ { config, pkgs, lib, ... }: { - imports = [ @@ -171,7 +170,7 @@ }; }; - services.dbus.packages = with pkgs; [ gnome.dconf ]; + services.dbus.packages = with pkgs; [ dconf ]; services.gnome.gnome-keyring.enable = true; hardware.bluetooth.enable = true; # services.blueman.enable = true; diff --git a/nixos/big_data.nix b/nixos/big_data.nix index d4d161e..3efcad5 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -1,42 +1,52 @@ { config, lib, pkgs, ... }: -let sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; - dontUnpack = true; - installPhase = '' - # source standard environment - . 
$stdenv/setup +let + keytab_path = /etc/hadoop.keytab; + hadoopConf = import { + inherit pkgs lib; + cfg = config.services.hadoop; + }; + hadoopConfDir = "${hadoopConf}/"; + sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . $stdenv/setup - # shorthands - base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ - # create output dirs for new derivation - mkdir -p $out/ + # create output dirs for new derivation + mkdir -p $out/ - # link unchanged files from the original gnome-session - for f in $base_conf/*.template ; do - ln -sf $f $out/ - done + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done - # change selected files - cp $out/log4j.properties{.template,} + # change selected files + cp $out/log4j.properties{.template,} - cat > $out/spark-env.sh <<- STOP - export JAVA_HOME="${pkgs.jdk8}" - export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" - export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" - export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" - export SPARKR_R_SHELL="${pkgs.R}/bin/R" - export PATH="\$PATH:${pkgs.R}/bin" - STOP + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export HADOOP_CONF_DIR="${hadoopConfDir}" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP - cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir 
hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - STOP - ''; -}; + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + spark.yarn.keytab ${keytab_path} + spark.yarn.principal spark/my.engine@MY.ENGINE + STOP + ''; + }; in { @@ -66,132 +76,113 @@ in - hadoop = - let - keytab_path = /etc/hadoop.keytab; - in + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://my.engine:8020"; - { - coreSite = { - # "fs.defaultFS" = "hdfs://0.0.0.0:8020"; + # HDFS IMPERSONATION + "hadoop.proxyuser.hdfs.hosts" = "*"; + "hadoop.proxyuser.hdfs.groups" = "*"; - # "hadoop.http.authentication.simple.anonymous.allowed" = "false"; - # "hadoop.http.authentication.signature.secret.file" = "/var/lib/hadoop/security/http_secret"; - # "hadoop.http.authentication.type" = "kerberos"; - # "hadoop.http.authentication.kerberos.principal" = "http/my.engine@MY.ENGINE"; - # "hadoop.http.authentication.cookie.domain" = "my.engine"; + # HIVE IMPERSONATION + "hadoop.proxyuser.hive.hosts" = "*"; + "hadoop.proxyuser.hive.groups" = "*"; - # "hadoop.security.authentication" = "kerberos"; - # "hadoop.security.authorization" = "true"; - # "hadoop.rpc.protection" = "authentication"; + # ENABLE AUTHENTICATION + "hadoop.security.authentication" = "kerberos"; + "hadoop.security.authorization" = "true"; + "hadoop.rpc.protection" = "privacy"; - - # "hadoop.rpc.protection" = "authentication"; - # "hadoop.security.auth_to_local" = '' - # RULE:[2:$1/$2@$0]([ndj]n/.*@MY.ENGINE)s/.*/hdfs/ - # RULE:[2:$1/$2@$0]([rn]m/.*@MY.ENGINE)s/.*/yarn/ - # RULE:[2:$1/$2@$0](jhs/.*@MY.ENGINE)s/.*/mapred/ - # DEFAULT - # ''; - # "hadoop.proxyuser.superuser.hosts" = "*"; # TODO: restrict - # "hadoop.proxyuser.superuser.groups" = "*"; # TODO: restrict - - "fs.defaultFS" = "hdfs://my.engine:8020"; - - # HDFS IMPERSONATION - "hadoop.proxyuser.hdfs.hosts" = "*"; - 
"hadoop.proxyuser.hdfs.groups" = "*"; - - # HIVE IMPERSONATION - "hadoop.proxyuser.hive.hosts" = "*"; - "hadoop.proxyuser.hive.groups" = "*"; - - # ENABLE AUTHENTICATION - "hadoop.security.authentication" = "kerberos"; - "hadoop.security.authorization" = "true"; - "hadoop.rpc.protection" = "privacy"; - - "hadoop.security.auth_to_local" = '' - RULE:[2:$1/$2@$0]([ndj]n/.*@MY\.ENGINE)s/.*/hdfs/ - RULE:[2:$1/$2@$0]([rn]m/.*@MY\.ENGINE)s/.*/yarn/ - RULE:[2:$1/$2@$0](jhs/.*@MY\.ENGINE)s/.*/mapred/ - DEFAULT - ''; - }; - hdfsSite = { - # DATA - "dfs.namenode.name.dir" = "/hdfs/dfs/name"; - "dfs.datanode.data.dir" = "/hdfs/dfs/data"; - - # HDFS SECURITY - "dfs.block.access.token.enable" = "true"; - - # NAME NODE SECURITY - "dfs.namenode.keytab.file" = keytab_path; - "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; - "dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; - - # SECONDARY NAME NODE SECURITY - "dfs.secondary.namenode.keytab.file" = keytab_path; - "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; - "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; - - # DATA NODE SECURITY - "dfs.datanode.keytab.file" = keytab_path; - "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; - - # WEBHDFS SECURITY - "dfs.webhdfs.enabled" = "true"; - - # WEB AUTHENTICATION CONFIG - "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; - "dfs.web.authentication.kerberos.keytab" = keytab_path; - "ignore.secure.ports.for.testing" = "true"; - "dfs.http.policy" = "HTTP_ONLY"; - "dfs.data.transfer.protection" = "privacy"; - - # ## MULTIHOMED - # "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - # "dfs.namenode.servicerpc-bind-host" = "0.0.0.0"; - # "dfs.namenode.http-bind-host" = "0.0.0.0"; - # "dfs.namenode.https-bind-host" = "0.0.0.0"; - # "dfs.client.use.datanode.hostname" = "true"; # force connection by hostname - # "dfs.datanode.use.datanode.hostname" = "true"; # 
force connection by hostname - - - # "dfs.data.transfer.protection" = "privacy"; - # "hadoop.rpc.protection" = "privacy"; - # "dfs.http.policy" = "HTTP_ONLY"; - # "dfs.datanode.address" = "0.0.0.0:10019"; - # "dfs.datanode.http.address" = "0.0.0.0:10022"; - # "dfs.datanode.https.address" = "0.0.0.0:10023"; - - - # "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; - # "dfs.datanode.keytab.file" = keytab_path; - - # "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; - # "dfs.namenode.keytab.file" = keytab_path; - - # "dfs.block.access.token.enable" = "true"; - - - }; - yarnSite = { - # "yarn.acl.enable" = "true"; - # "yarn.admin.acl" = "*"; # TODO: restrict - }; - extraConfDirs = [ ]; - - hdfs = { - namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; - datanode = { enable = true; restartIfChanged = true; }; - journalnode = { enable = true; restartIfChanged = true; }; - zkfc = { enable = true; restartIfChanged = true; }; - httpfs = { enable = true; restartIfChanged = true; }; - }; - yarn = { resourcemanager.enable = true; nodemanager.enable = true; }; + "hadoop.security.auth_to_local" = '' + RULE:[2:$1/$2@$0]([ndj]n/.*@MY\.ENGINE)s/.*/hdfs/ + RULE:[2:$1/$2@$0]([rn]m/.*@MY\.ENGINE)s/.*/yarn/ + RULE:[2:$1/$2@$0](jhs/.*@MY\.ENGINE)s/.*/mapred/ + DEFAULT + ''; }; + hdfsSite = { + # DATA + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + + # HDFS SECURITY + "dfs.block.access.token.enable" = "true"; + + # NAME NODE SECURITY + "dfs.namenode.keytab.file" = keytab_path; + "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # SECONDARY NAME NODE SECURITY + "dfs.secondary.namenode.keytab.file" = keytab_path; + "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # DATA NODE SECURITY 
+ "dfs.datanode.keytab.file" = keytab_path; + "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; + + # JOURNAL NODE SECURITY + "dfs.journalnode.keytab.file" = keytab_path; + "dfs.journalnode.kerberos.principal" = "jn/my.engine@MY.ENGINE"; + + # WEBHDFS SECURITY + "dfs.webhdfs.enabled" = "true"; + + # WEB AUTHENTICATION CONFIG + "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; + "dfs.web.authentication.kerberos.keytab" = keytab_path; + "ignore.secure.ports.for.testing" = "true"; + "dfs.http.policy" = "HTTP_ONLY"; + "dfs.data.transfer.protection" = "privacy"; + + # ## MULTIHOMED + # "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.servicerpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.http-bind-host" = "0.0.0.0"; + # "dfs.namenode.https-bind-host" = "0.0.0.0"; + # "dfs.client.use.datanode.hostname" = "true"; # force connection by hostname + # "dfs.datanode.use.datanode.hostname" = "true"; # force connection by hostname + }; + yarnSite = { + "yarn.nodemanager.admin-env" = "PATH=$PATH"; + "yarn.nodemanager.aux-services" = "mapreduce_shuffle"; + "yarn.nodemanager.aux-services.mapreduce_shuffle.class" = "org.apache.hadoop.mapred.ShuffleHandler"; + "yarn.nodemanager.bind-host" = "0.0.0.0"; + "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; + "yarn.nodemanager.env-whitelist" = "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,LANG,TZ"; + "yarn.nodemanager.linux-container-executor.group" = "hadoop"; + "yarn.nodemanager.linux-container-executor.path" = "/run/wrappers/yarn-nodemanager/bin/container-executor"; + "yarn.nodemanager.log-dirs" = "/var/log/hadoop/yarn/nodemanager"; + "yarn.resourcemanager.bind-host" = "0.0.0.0"; + "yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler"; + + "yarn.resourcemanager.keytab" = keytab_path; + 
"yarn.resourcemanager.principal" = "rm/my.engine@MY.ENGINE"; + "yarn.nodemanager.keytab" = keytab_path; + "yarn.nodemanager.principal" = "nm/my.engine@MY.ENGINE"; + + # "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; + + "yarn.scheduler.capacity.root.queues" = "default"; + "yarn.scheduler.capacity.root.default.capacity" = 100; + # "yarn.scheduler.capacity.root.default.state" = "RUNNING"; + "yarn.scheduler.capacity.root.acl_submit_applications" = "hadoop,yarn,mapred,hdfs"; + }; + extraConfDirs = [ ]; + + hdfs = { + namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; + datanode = { enable = true; restartIfChanged = true; }; + journalnode = { enable = true; restartIfChanged = true; }; + zkfc = { enable = true; restartIfChanged = true; }; + httpfs = { enable = true; restartIfChanged = true; }; + }; + yarn = { + resourcemanager = { enable = true; restartIfChanged = true; }; + nodemanager = { enable = true; restartIfChanged = true; }; + }; + }; kerberos_server = { enable = true; @@ -236,8 +227,7 @@ in ''; }; - - + users.users.bertof.extraGroups = [ "hadoop" ]; systemd.services.spark-history = { path = with pkgs; [ procps openssh nettools ]; From 86a6fbac7679fe517a4d9eb7863de5b08d003624 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Tue, 29 Mar 2022 09:47:57 +0200 Subject: [PATCH 08/12] Update big data config --- nixos/big_data.nix | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 3efcad5..7566112 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -1,11 +1,16 @@ { config, lib, pkgs, ... 
}: let keytab_path = /etc/hadoop.keytab; + pysparkPackageSelector = p: with p; [ numpy pyspark ]; + pysparkEnv = pkgs.python3.withPackages pysparkPackageSelector; hadoopConf = import { inherit pkgs lib; cfg = config.services.hadoop; }; hadoopConfDir = "${hadoopConf}/"; + spark = pkgs.spark.override { + extraPythonPackages = pysparkPackageSelector pkgs.python3.pkgs; + }; sparkConfDir = pkgs.stdenv.mkDerivation { name = "spark-config"; dontUnpack = true; @@ -31,7 +36,8 @@ let export JAVA_HOME="${pkgs.jdk8}" export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYSPARK_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" + export PYSPARK_DRIVER_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" export HADOOP_CONF_DIR="${hadoopConfDir}" export SPARKR_R_SHELL="${pkgs.R}/bin/R" @@ -42,8 +48,8 @@ let spark.eventLog.enabled true spark.eventLog.dir hdfs://localhost:/logs/spark spark.history.fs.logDirectory hdfs://localhost:/logs/spark - spark.yarn.keytab ${keytab_path} - spark.yarn.principal spark/my.engine@MY.ENGINE + # spark.yarn.keytab ${keytab_path} + # spark.yarn.principal spark/my.engine@MY.ENGINE STOP ''; }; @@ -63,19 +69,12 @@ in services = { spark = { - master = { - enable = true; - restartIfChanged = true; - }; - worker = { - enable = true; - restartIfChanged = true; - }; + package = spark; + master = { enable = true; restartIfChanged = true; }; + worker = { enable = true; restartIfChanged = true; }; confDir = sparkConfDir; }; - - hadoop = { coreSite = { "fs.defaultFS" = "hdfs://my.engine:8020"; @@ -188,11 +187,7 @@ in enable = true; realms."MY.ENGINE".acl = [ { principal = "*/admin"; access = "all"; } - { principal = "admin"; access = "all"; } - { principal = "*/localhost"; access = "all"; } { principal = 
"*/my.engine"; access = "all"; } - { principal = "nn/my.engine"; access = "all"; } - { principal = "hdfs"; access = "all"; } ]; }; }; From 52047104cc40f0feeab90eb33ffd9fd9755f5efa Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Wed, 30 Mar 2022 09:35:53 +0200 Subject: [PATCH 09/12] Fix spark python + spark history kerberos --- nixos/big_data.nix | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 7566112..eef0133 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -12,7 +12,7 @@ let extraPythonPackages = pysparkPackageSelector pkgs.python3.pkgs; }; sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; + name = "spark-conf"; dontUnpack = true; installPhase = '' # source standard environment @@ -36,8 +36,8 @@ let export JAVA_HOME="${pkgs.jdk8}" export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" - export PYSPARK_DRIVER_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" + export PYSPARK_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}" + export PYSPARK_DRIVER_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}" export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" export HADOOP_CONF_DIR="${hadoopConfDir}" export SPARKR_R_SHELL="${pkgs.R}/bin/R" @@ -45,11 +45,17 @@ let STOP cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - # spark.yarn.keytab ${keytab_path} - # spark.yarn.principal spark/my.engine@MY.ENGINE + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + # spark.yarn.keytab ${keytab_path} + # spark.yarn.principal spark/my.engine@MY.ENGINE + spark.history.ui.acls.enable 
true + spark.history.kerberos.enabled true + spark.history.kerberos.keytab ${keytab_path} + spark.history.kerberos.principal spark/my.engine@MY.ENGINE + spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} + spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages STOP ''; }; @@ -174,7 +180,7 @@ in namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; datanode = { enable = true; restartIfChanged = true; }; journalnode = { enable = true; restartIfChanged = true; }; - zkfc = { enable = true; restartIfChanged = true; }; + zkfc = { enable = false; restartIfChanged = true; }; # ZOOKEEPER DISABLED, not using High Availability setup httpfs = { enable = true; restartIfChanged = true; }; }; yarn = { From 10c2df06ec13f1d27a201ec69329adb1e8e1d4da Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Wed, 30 Mar 2022 16:15:30 +0200 Subject: [PATCH 10/12] Update big data --- nixos/big_data.nix | 39 +++++++++++++++++++++++++++++---------- nixos/thor.nix | 23 ----------------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index eef0133..f77009e 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -1,6 +1,24 @@ { config, lib, pkgs, ... 
}: let - keytab_path = /etc/hadoop.keytab; + setup_script = '' + sudo mkdir -p /hdfs + sudo chown -R hdfs:hadoop /hdfs + + for p in {nn,dn,jn,rm,nm,jhs,HTTP}; do + sudo kadmin.local -q "ank -randkey $p/my.engine"; + sudo kadmin.local -q "xst -k /etc/hadoop.keytab $p/my.engine"; + sudo kadmin.local -q "ktrem -k /etc/hadoop.keytab $p/my.engine old" + done + sudo chown hdfs:hadoop /etc/hadoop.keytab + + + sudo kadmin.local -q "ank -randkey spark/my.engine"; + sudo kadmin.local -q "xst -k /etc/spark.keytab spark/my.engine"; + sudo kadmin.local -q "ktrem -k /etc/spark.keytab spark/my.engine old" + sudo chown spark:spark /etc/spark.keytab + ''; + hadoop_keytab_path = "/etc/hadoop.keytab"; + spark_keytab_path = "/etc/spark.keytab"; pysparkPackageSelector = p: with p; [ numpy pyspark ]; pysparkEnv = pkgs.python3.withPackages pysparkPackageSelector; hadoopConf = import { @@ -48,11 +66,11 @@ let spark.eventLog.enabled true spark.eventLog.dir hdfs://localhost:/logs/spark spark.history.fs.logDirectory hdfs://localhost:/logs/spark - # spark.yarn.keytab ${keytab_path} + # spark.yarn.keytab ${hadoop_keytab_path} # spark.yarn.principal spark/my.engine@MY.ENGINE spark.history.ui.acls.enable true spark.history.kerberos.enabled true - spark.history.kerberos.keytab ${keytab_path} + spark.history.kerberos.keytab ${hadoop_keytab_path} spark.history.kerberos.principal spark/my.engine@MY.ENGINE spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages @@ -109,26 +127,27 @@ in # DATA "dfs.namenode.name.dir" = "/hdfs/dfs/name"; "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + "dfs.journalnode.edits.dir" = "/hdfs/dfs/edits"; # HDFS SECURITY "dfs.block.access.token.enable" = "true"; # NAME NODE SECURITY - "dfs.namenode.keytab.file" = keytab_path; + "dfs.namenode.keytab.file" = hadoop_keytab_path; "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; 
"dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; # SECONDARY NAME NODE SECURITY - "dfs.secondary.namenode.keytab.file" = keytab_path; + "dfs.secondary.namenode.keytab.file" = hadoop_keytab_path; "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; # DATA NODE SECURITY - "dfs.datanode.keytab.file" = keytab_path; + "dfs.datanode.keytab.file" = hadoop_keytab_path; "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; # JOURNAL NODE SECURITY - "dfs.journalnode.keytab.file" = keytab_path; + "dfs.journalnode.keytab.file" = hadoop_keytab_path; "dfs.journalnode.kerberos.principal" = "jn/my.engine@MY.ENGINE"; # WEBHDFS SECURITY @@ -136,7 +155,7 @@ in # WEB AUTHENTICATION CONFIG "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; - "dfs.web.authentication.kerberos.keytab" = keytab_path; + "dfs.web.authentication.kerberos.keytab" = hadoop_keytab_path; "ignore.secure.ports.for.testing" = "true"; "dfs.http.policy" = "HTTP_ONLY"; "dfs.data.transfer.protection" = "privacy"; @@ -162,9 +181,9 @@ in "yarn.resourcemanager.bind-host" = "0.0.0.0"; "yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler"; - "yarn.resourcemanager.keytab" = keytab_path; + "yarn.resourcemanager.keytab" = hadoop_keytab_path; "yarn.resourcemanager.principal" = "rm/my.engine@MY.ENGINE"; - "yarn.nodemanager.keytab" = keytab_path; + "yarn.nodemanager.keytab" = hadoop_keytab_path; "yarn.nodemanager.principal" = "nm/my.engine@MY.ENGINE"; # "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; diff --git a/nixos/thor.nix b/nixos/thor.nix index 7b41f46..e681db1 100644 --- a/nixos/thor.nix +++ b/nixos/thor.nix @@ -172,29 +172,6 @@ # teamviewer.enable = true; }; - systemd.services.spark-history = { - path = with pkgs; [ procps 
openssh nettools ]; - description = "spark history service."; - after = [ "network.target" ]; - wantedBy = [ "multi-user.target" ]; - restartIfChanged = true; - environment = { - SPARK_CONF_DIR = sparkConfDir; - SPARK_LOG_DIR = "/var/log/spark"; - }; - serviceConfig = { - Type = "forking"; - User = "spark"; - Group = "spark"; - WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; - ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; - ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; - TimeoutSec = 300; - StartLimitBurst = 10; - Restart = "always"; - }; - }; - services.teamviewer.enable = true; security = { From 2494d1e846e11d8a4a2f1e28c71ae8fd366705e1 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Thu, 31 Mar 2022 10:40:39 +0200 Subject: [PATCH 11/12] Fix spark keytab path --- nixos/big_data.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index f77009e..7e30fad 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -66,11 +66,11 @@ let spark.eventLog.enabled true spark.eventLog.dir hdfs://localhost:/logs/spark spark.history.fs.logDirectory hdfs://localhost:/logs/spark - # spark.yarn.keytab ${hadoop_keytab_path} + # spark.yarn.keytab ${spark_keytab_path} # spark.yarn.principal spark/my.engine@MY.ENGINE spark.history.ui.acls.enable true spark.history.kerberos.enabled true - spark.history.kerberos.keytab ${hadoop_keytab_path} + spark.history.kerberos.keytab ${spark_keytab_path} spark.history.kerberos.principal spark/my.engine@MY.ENGINE spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages STOP ''; }; From 742e6625bce86752325e7bb69b84431e932cd3f9 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Wed, 13 Apr 2022 19:23:32 +0200 Subject: [PATCH 12/12] Fix python path --- 
nixos/big_data.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 7e30fad..a734dbc 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -74,6 +74,7 @@ let spark.history.kerberos.principal spark/my.engine@MY.ENGINE spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages + spark.executorEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} STOP ''; };