From 72df315b4de9a5e72c618c8a932c784404a7b644 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Sat, 12 Feb 2022 09:35:54 +0100 Subject: [PATCH 01/12] Big data configuration --- nixos/thor.nix | 138 +++++++++++++++- spark_conf/fairscheduler.xml.template | 31 ++++ spark_conf/log4j.properties | 46 ++++++ spark_conf/log4j.properties.template | 46 ++++++ spark_conf/metrics.properties.template | 210 ++++++++++++++++++++++++ spark_conf/spark-defaults.conf | 39 +++++ spark_conf/spark-defaults.conf.template | 27 +++ spark_conf/spark-env.sh | 7 + spark_conf/spark-env.sh.template | 73 ++++++++ spark_conf/workers.template | 19 +++ 10 files changed, 634 insertions(+), 2 deletions(-) create mode 100644 spark_conf/fairscheduler.xml.template create mode 100644 spark_conf/log4j.properties create mode 100644 spark_conf/log4j.properties.template create mode 100644 spark_conf/metrics.properties.template create mode 100644 spark_conf/spark-defaults.conf create mode 100644 spark_conf/spark-defaults.conf.template create mode 100644 spark_conf/spark-env.sh create mode 100755 spark_conf/spark-env.sh.template create mode 100644 spark_conf/workers.template diff --git a/nixos/thor.nix b/nixos/thor.nix index 6e7eaad..a2a8371 100644 --- a/nixos/thor.nix +++ b/nixos/thor.nix @@ -3,6 +3,46 @@ # and in the NixOS manual (accessible by running ‘nixos-help’). { config, pkgs, lib, ... }: + +let sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . 
$stdenv/setup + + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + + # create output dirs for new derivation + mkdir -p $out/ + + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done + + # change selected files + cp $out/log4j.properties{.template,} + + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP + + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + STOP + ''; +}; +in { imports = [ @@ -102,7 +142,7 @@ }; }; clamav = { daemon.enable = true; updater.enable = true; }; - dbus.packages = with pkgs; [ gnome.dconf ]; + dbus.packages = with pkgs; [ pkgs.dconf ]; gnome.gnome-keyring.enable = true; gvfs = { enable = true; package = pkgs.gnome3.gvfs; }; fwupd.enable = true; @@ -154,11 +194,104 @@ }; # gnome.gnome-remote-desktop.enable = true; zerotierone = { enable = true; joinNetworks = [ "8056c2e21cf9c753" ]; }; + + + + spark = { + master = { + enable = true; + restartIfChanged = true; + }; + worker = { + enable = true; + restartIfChanged = true; + }; + confDir = sparkConfDir; + }; + + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://localhost:8020"; + }; + hdfsSite = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + "dfs.permissions" = "false"; + }; + + hdfs = { + namenode = { + enable = true; + formatOnInit = true; + restartIfChanged = true; + }; + datanode = { + enable = true; + restartIfChanged = true; + }; + journalnode = { + enable = true; + restartIfChanged = 
true; + }; + zkfc = { + enable = true; + restartIfChanged = true; + }; + httpfs = { + enable = true; + restartIfChanged = true; + }; + }; + yarn = { + resourcemanager.enable = true; + nodemanager.enable = true; + }; + }; + + + ethminer = { + enable = false; + wallet = "0x73b788882e1C182123333f42FFf275B7dd7f51bb"; + toolkit = "opencl"; + rig = "thor"; + pool = "eth-eu1.nanopool.org"; + stratumPort = 9999; + + registerMail = ""; + }; + + + # teamviewer.enable = true; + }; + + systemd.services.spark-history = { + path = with pkgs; [ procps openssh nettools ]; + description = "spark history service."; + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + restartIfChanged = true; + environment = { + SPARK_CONF_DIR = sparkConfDir; + SPARK_LOG_DIR = "/var/log/spark"; + }; + serviceConfig = { + Type = "forking"; + User = "spark"; + Group = "spark"; + WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; + ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; + ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; + TimeoutSec = 300; + StartLimitBurst = 10; + Restart = "always"; + }; }; services.teamviewer.enable = true; security = { + pam.services."kde" = { + enableKwallet = true; + }; rtkit.enable = true; sudo.extraConfig = '' Defaults pwfeedback @@ -187,7 +320,8 @@ allowUnfree = true; packageOverrides = pkgs: { steam = pkgs.steam.override { - extraPkgs = pkgs: with pkgs; [ icu ]; + extraPkgs = pkgs: with pkgs; [ ]; + extraLibraries = pkgs: with pkgs; [ fontconfig.lib icu freetype ]; }; }; # cudaSupport = true; diff --git a/spark_conf/fairscheduler.xml.template b/spark_conf/fairscheduler.xml.template new file mode 100644 index 0000000..385b2e7 --- /dev/null +++ b/spark_conf/fairscheduler.xml.template @@ -0,0 +1,31 @@ + + + + + + + FAIR + 1 + 2 + + + FIFO + 2 + 3 + + diff --git a/spark_conf/log4j.properties b/spark_conf/log4j.properties new file mode 100644 index 0000000..dc7b9ea --- 
/dev/null +++ b/spark_conf/log4j.properties @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
+log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +# For deploying Spark ThriftServer +# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 +log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter +log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message +log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/log4j.properties.template b/spark_conf/log4j.properties.template new file mode 100644 index 0000000..dc7b9ea --- /dev/null +++ b/spark_conf/log4j.properties.template @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. +log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +# For deploying Spark ThriftServer +# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 +log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter +log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message +log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/metrics.properties.template b/spark_conf/metrics.properties.template new file mode 100644 index 0000000..f52d33f --- /dev/null +++ b/spark_conf/metrics.properties.template @@ -0,0 +1,210 @@ +# +# Licensed to the Apache Software Foundation (ASF) under 
one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. +# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. Common sources, like JvmSource, which will collect low level state. +# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. 
+# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. +# 6. The metrics system can also be configured using Spark configuration +# parameters. The relevant parameter names are formed by adding the +# prefix "spark.metrics.conf." to the configuration entries detailed in +# this file (see examples below). + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. 
+ +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. 
The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use +# regex NONE Optional filter to send only metrics matching this regex string + +# org.apache.spark.metrics.sink.StatsdSink +# Name: Default: Description: +# host 127.0.0.1 Hostname or IP of StatsD server +# port 8125 Port of StatsD server +# period 10 Poll period +# unit seconds Units of poll period +# prefix EMPTY STRING Prefix to prepend to metric name + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Enable StatsdSink for all instances by class name +#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink +#*.sink.statsd.prefix=spark + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance 
+#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Example configuration for Graphite sink +#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink +#*.sink.graphite.host= +#*.sink.graphite.port= +#*.sink.graphite.period=10 +#*.sink.graphite.unit=seconds +#*.sink.graphite.prefix= + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus +#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/spark_conf/spark-defaults.conf b/spark_conf/spark-defaults.conf new file mode 100644 index 0000000..678d9c8 --- /dev/null +++ b/spark_conf/spark-defaults.conf @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" + +# spark.io.compression.codec lzf +# spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec +# spark.eventLog.compress false + +spark.eventLog.enabled true +spark.eventLog.dir hdfs://localhost:/logs/spark +spark.history.fs.logDirectory hdfs://localhost:/logs/spark + + +# +# diff --git a/spark_conf/spark-defaults.conf.template b/spark_conf/spark-defaults.conf.template new file mode 100644 index 0000000..19cba6e --- /dev/null +++ b/spark_conf/spark-defaults.conf.template @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/spark_conf/spark-env.sh b/spark_conf/spark-env.sh new file mode 100644 index 0000000..1d67e58 --- /dev/null +++ b/spark_conf/spark-env.sh @@ -0,0 +1,7 @@ +export JAVA_HOME="/nix/store/d0akdmr675jrlabv7n8syg8yrg1zlyxz-openjdk-8u272-b10" +export SPARK_HOME="/nix/store/zhj5q1pi0bs2lpc0lbkw8qkg03ywx9b8-spark-3.1.2/lib/spark-3.1.2" +export 
SPARK_DIST_CLASSPATH=/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/mapreduce/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/* +export PYSPARK_PYTHON="/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/bin/python3.9" +export PYTHONPATH="$PYTHONPATH:/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/lib/python3.9/site-packages" +export SPARKR_R_SHELL="/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin/R" +export PATH="$PATH:/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin" diff --git a/spark_conf/spark-env.sh.template b/spark_conf/spark-env.sh.template new file mode 100755 index 0000000..c868650 --- /dev/null +++ b/spark_conf/spark-env.sh.template @@ -0,0 +1,73 @@ +#!/nix/store/vfai0jim0db67nk9rd7ziq29jxb5n79n-bash-5.1-p8/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. + +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in YARN client/cluster mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 
1000M, 2G) (Default: 1G) + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. 
(Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS diff --git a/spark_conf/workers.template b/spark_conf/workers.template new file mode 100644 index 0000000..be42a63 --- /dev/null +++ b/spark_conf/workers.template @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. 
+localhost \ No newline at end of file From 9e4879f7a0f8b36015d4b91c59bfe540f8d42d90 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Mon, 21 Feb 2022 14:37:23 +0100 Subject: [PATCH 02/12] Big data config for laptop --- nixos/base.nix | 120 +++++++++++++- spark_conf/fairscheduler.xml.template | 31 ---- spark_conf/log4j.properties | 46 ------ spark_conf/log4j.properties.template | 46 ------ spark_conf/metrics.properties.template | 210 ------------------------ spark_conf/spark-defaults.conf | 39 ----- spark_conf/spark-defaults.conf.template | 27 --- spark_conf/spark-env.sh | 7 - spark_conf/spark-env.sh.template | 73 -------- spark_conf/workers.template | 19 --- 10 files changed, 119 insertions(+), 499 deletions(-) delete mode 100644 spark_conf/fairscheduler.xml.template delete mode 100644 spark_conf/log4j.properties delete mode 100644 spark_conf/log4j.properties.template delete mode 100644 spark_conf/metrics.properties.template delete mode 100644 spark_conf/spark-defaults.conf delete mode 100644 spark_conf/spark-defaults.conf.template delete mode 100644 spark_conf/spark-env.sh delete mode 100755 spark_conf/spark-env.sh.template delete mode 100644 spark_conf/workers.template diff --git a/nixos/base.nix b/nixos/base.nix index 4896ec9..4a68410 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -1,4 +1,46 @@ -{ config, pkgs, lib, ... }: { +{ config, pkgs, lib, ... }: + +let sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . 
$stdenv/setup + + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + + # create output dirs for new derivation + mkdir -p $out/ + + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done + + # change selected files + cp $out/log4j.properties{.template,} + + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP + + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + STOP + ''; +}; +in + +{ imports = [ @@ -210,6 +252,82 @@ services.power-profiles-daemon.enable = true; # services.teamviewer.enable = true; + services = { + + spark = { + master = { + enable = true; + restartIfChanged = true; + }; + worker = { + enable = true; + restartIfChanged = true; + }; + confDir = sparkConfDir; + }; + + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://localhost:8020"; + }; + hdfsSite = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + "dfs.permissions" = "false"; + }; + + hdfs = { + namenode = { + enable = true; + formatOnInit = true; + restartIfChanged = true; + }; + datanode = { + enable = true; + restartIfChanged = true; + }; + journalnode = { + enable = true; + restartIfChanged = true; + }; + zkfc = { + enable = true; + restartIfChanged = true; + }; + httpfs = { + enable = true; + restartIfChanged = true; + }; + }; + yarn = { + resourcemanager.enable = true; + nodemanager.enable = true; + }; + }; + }; + + systemd.services.spark-history = { + path = with pkgs; [ procps openssh nettools ]; + 
description = "spark history service."; + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + restartIfChanged = true; + environment = { + SPARK_CONF_DIR = sparkConfDir; + SPARK_LOG_DIR = "/var/log/spark"; + }; + serviceConfig = { + Type = "forking"; + User = "spark"; + Group = "spark"; + WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; + ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; + ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; + TimeoutSec = 300; + StartLimitBurst = 10; + Restart = "always"; + }; + }; + # Virtualisation virtualisation = { diff --git a/spark_conf/fairscheduler.xml.template b/spark_conf/fairscheduler.xml.template deleted file mode 100644 index 385b2e7..0000000 --- a/spark_conf/fairscheduler.xml.template +++ /dev/null @@ -1,31 +0,0 @@ - - - - - - - FAIR - 1 - 2 - - - FIFO - 2 - 3 - - diff --git a/spark_conf/log4j.properties b/spark_conf/log4j.properties deleted file mode 100644 index dc7b9ea..0000000 --- a/spark_conf/log4j.properties +++ /dev/null @@ -1,46 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. -log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.sparkproject.jetty=WARN -log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR - -# For deploying Spark ThriftServer -# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 -log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter -log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message -log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/log4j.properties.template b/spark_conf/log4j.properties.template deleted file mode 100644 index dc7b9ea..0000000 --- a/spark_conf/log4j.properties.template +++ /dev/null @@ -1,46 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. 
-log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.sparkproject.jetty=WARN -log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR - -# For deploying Spark ThriftServer -# SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805 -log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter -log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message -log4j.appender.console.filter.1.AcceptOnMatch=false diff --git a/spark_conf/metrics.properties.template b/spark_conf/metrics.properties.template deleted file mode 100644 index f52d33f..0000000 --- a/spark_conf/metrics.properties.template +++ /dev/null @@ -1,210 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -# syntax: [instance].sink|source.[name].[options]=[value] - -# This file configures Spark's internal metrics system. The metrics system is -# divided into instances which correspond to internal components. -# Each instance can be configured to report its metrics to one or more sinks. -# Accepted values for [instance] are "master", "worker", "executor", "driver", -# and "applications". A wildcard "*" can be used as an instance name, in -# which case all instances will inherit the supplied property. -# -# Within an instance, a "source" specifies a particular set of grouped metrics. -# there are two kinds of sources: -# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will -# collect a Spark component's internal state. Each instance is paired with a -# Spark source that is added automatically. -# 2. Common sources, like JvmSource, which will collect low level state. -# These can be added through configuration options and are then loaded -# using reflection. -# -# A "sink" specifies where metrics are delivered to. Each instance can be -# assigned one or more sinks. -# -# The sink|source field specifies whether the property relates to a sink or -# source. -# -# The [name] field specifies the name of source or sink. -# -# The [options] field is the specific property of this source or sink. The -# source or sink is responsible for parsing this property. -# -# Notes: -# 1. To add a new sink, set the "class" option to a fully qualified class -# name (see examples below). -# 2. Some sinks involve a polling period. The minimum allowed polling period -# is 1 second. -# 3. Wildcard properties can be overridden by more specific properties. -# For example, master.sink.console.period takes precedence over -# *.sink.console.period. -# 4. 
A metrics specific configuration -# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be -# added to Java properties using -Dspark.metrics.conf=xxx if you want to -# customize metrics system. You can also put the file in ${SPARK_HOME}/conf -# and it will be loaded automatically. -# 5. The MetricsServlet sink is added by default as a sink in the master, -# worker and driver, and you can send HTTP requests to the "/metrics/json" -# endpoint to get a snapshot of all the registered metrics in JSON format. -# For master, requests to the "/metrics/master/json" and -# "/metrics/applications/json" endpoints can be sent separately to get -# metrics snapshots of the master instance and applications. This -# MetricsServlet does not have to be configured. -# 6. The metrics system can also be configured using Spark configuration -# parameters. The relevant parameter names are formed by adding the -# prefix "spark.metrics.conf." to the configuration entries detailed in -# this file (see examples below). - -## List of available common sources and their properties. - -# org.apache.spark.metrics.source.JvmSource -# Note: Currently, JvmSource is the only available common source. -# It can be added to an instance by setting the "class" option to its -# fully qualified class name (see examples below). - -## List of available sinks and their properties. 
- -# org.apache.spark.metrics.sink.ConsoleSink -# Name: Default: Description: -# period 10 Poll period -# unit seconds Unit of the poll period - -# org.apache.spark.metrics.sink.CSVSink -# Name: Default: Description: -# period 10 Poll period -# unit seconds Unit of the poll period -# directory /tmp Where to store CSV files - -# org.apache.spark.metrics.sink.GangliaSink -# Name: Default: Description: -# host NONE Hostname or multicast group of the Ganglia server, -# must be set -# port NONE Port of the Ganglia server(s), must be set -# period 10 Poll period -# unit seconds Unit of the poll period -# ttl 1 TTL of messages sent by Ganglia -# dmax 0 Lifetime in seconds of metrics (0 never expired) -# mode multicast Ganglia network mode ('unicast' or 'multicast') - -# org.apache.spark.metrics.sink.JmxSink - -# org.apache.spark.metrics.sink.MetricsServlet -# Name: Default: Description: -# path VARIES* Path prefix from the web server root -# sample false Whether to show entire set of samples for histograms -# ('false' or 'true') -# -# * Default path is /metrics/json for all instances except the master. The -# master has two paths: -# /metrics/applications/json # App information -# /metrics/master/json # Master information - -# org.apache.spark.metrics.sink.PrometheusServlet -# Name: Default: Description: -# path VARIES* Path prefix from the web server root -# -# * Default path is /metrics/prometheus for all instances except the master. 
The -# master has two paths: -# /metrics/applications/prometheus # App information -# /metrics/master/prometheus # Master information - -# org.apache.spark.metrics.sink.GraphiteSink -# Name: Default: Description: -# host NONE Hostname of the Graphite server, must be set -# port NONE Port of the Graphite server, must be set -# period 10 Poll period -# unit seconds Unit of the poll period -# prefix EMPTY STRING Prefix to prepend to every metric's name -# protocol tcp Protocol ("tcp" or "udp") to use -# regex NONE Optional filter to send only metrics matching this regex string - -# org.apache.spark.metrics.sink.StatsdSink -# Name: Default: Description: -# host 127.0.0.1 Hostname or IP of StatsD server -# port 8125 Port of StatsD server -# period 10 Poll period -# unit seconds Units of poll period -# prefix EMPTY STRING Prefix to prepend to metric name - -## Examples -# Enable JmxSink for all instances by class name -#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink - -# Enable ConsoleSink for all instances by class name -#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink - -# Enable StatsdSink for all instances by class name -#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink -#*.sink.statsd.prefix=spark - -# Polling period for the ConsoleSink -#*.sink.console.period=10 -# Unit of the polling period for the ConsoleSink -#*.sink.console.unit=seconds - -# Polling period for the ConsoleSink specific for the master instance -#master.sink.console.period=15 -# Unit of the polling period for the ConsoleSink specific for the master -# instance -#master.sink.console.unit=seconds - -# Enable CsvSink for all instances by class name -#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink - -# Polling period for the CsvSink -#*.sink.csv.period=1 -# Unit of the polling period for the CsvSink -#*.sink.csv.unit=minutes - -# Polling directory for CsvSink -#*.sink.csv.directory=/tmp/ - -# Polling period for the CsvSink specific for the worker instance 
-#worker.sink.csv.period=10 -# Unit of the polling period for the CsvSink specific for the worker instance -#worker.sink.csv.unit=minutes - -# Enable Slf4jSink for all instances by class name -#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink - -# Polling period for the Slf4JSink -#*.sink.slf4j.period=1 -# Unit of the polling period for the Slf4jSink -#*.sink.slf4j.unit=minutes - -# Example configuration for Graphite sink -#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink -#*.sink.graphite.host= -#*.sink.graphite.port= -#*.sink.graphite.period=10 -#*.sink.graphite.unit=seconds -#*.sink.graphite.prefix= - -# Enable JvmSource for instance master, worker, driver and executor -#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -# Example configuration for PrometheusServlet -#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet -#*.sink.prometheusServlet.path=/metrics/prometheus -#master.sink.prometheusServlet.path=/metrics/master/prometheus -#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/spark_conf/spark-defaults.conf b/spark_conf/spark-defaults.conf deleted file mode 100644 index 678d9c8..0000000 --- a/spark_conf/spark-defaults.conf +++ /dev/null @@ -1,39 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" - -# spark.io.compression.codec lzf -# spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec -# spark.eventLog.compress false - -spark.eventLog.enabled true -spark.eventLog.dir hdfs://localhost:/logs/spark -spark.history.fs.logDirectory hdfs://localhost:/logs/spark - - -# -# diff --git a/spark_conf/spark-defaults.conf.template b/spark_conf/spark-defaults.conf.template deleted file mode 100644 index 19cba6e..0000000 --- a/spark_conf/spark-defaults.conf.template +++ /dev/null @@ -1,27 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/spark_conf/spark-env.sh b/spark_conf/spark-env.sh deleted file mode 100644 index 1d67e58..0000000 --- a/spark_conf/spark-env.sh +++ /dev/null @@ -1,7 +0,0 @@ -export JAVA_HOME="/nix/store/d0akdmr675jrlabv7n8syg8yrg1zlyxz-openjdk-8u272-b10" -export SPARK_HOME="/nix/store/zhj5q1pi0bs2lpc0lbkw8qkg03ywx9b8-spark-3.1.2/lib/spark-3.1.2" -export 
SPARK_DIST_CLASSPATH=/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/common/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/hdfs/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/mapreduce/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/lib/*:/nix/store/2b608nzvrsw1b8jd14vc77dc5j32q498-hadoop-3.3.1/lib/hadoop-3.3.1/share/hadoop/yarn/* -export PYSPARK_PYTHON="/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/bin/python3.9" -export PYTHONPATH="$PYTHONPATH:/nix/store/2c9w4p2x6x0l64fdvcmc11app7x4xran-python3-3.9.6/lib/python3.9/site-packages" -export SPARKR_R_SHELL="/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin/R" -export PATH="$PATH:/nix/store/h1s3y5jjrwdm5gd2qyxp2ldsnykippcb-R-4.1.2/bin" diff --git a/spark_conf/spark-env.sh.template b/spark_conf/spark-env.sh.template deleted file mode 100755 index c868650..0000000 --- a/spark_conf/spark-env.sh.template +++ /dev/null @@ -1,73 +0,0 @@ -#!/nix/store/vfai0jim0db67nk9rd7ziq29jxb5n79n-bash-5.1-p8/bin/bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This file is sourced when running various Spark programs. -# Copy it as spark-env.sh and edit that to configure Spark for your site. - -# Options read when launching programs locally with -# ./bin/run-example or ./bin/spark-submit -# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node -# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program - -# Options read by executors and drivers running inside the cluster -# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node -# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program -# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data -# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos - -# Options read in YARN client/cluster mode -# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) -# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN -# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). -# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) -# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 
1000M, 2G) (Default: 1G) - -# Options for the daemons used in the standalone deploy mode -# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname -# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master -# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") -# - SPARK_WORKER_CORES, to set the number of cores to use on this machine -# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) -# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker -# - SPARK_WORKER_DIR, to set the working directory of worker processes -# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") -# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). -# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") -# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") -# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") -# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons -# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers - -# Options for launcher -# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") - -# Generic options for the daemons used in the standalone deploy mode -# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) -# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) -# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. -# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) -# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) -# - SPARK_NICENESS The scheduling priority for daemons. 
(Default: 0) -# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. -# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. -# You might get better performance to enable these options if using native BLAS (see SPARK-21305). -# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL -# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS diff --git a/spark_conf/workers.template b/spark_conf/workers.template deleted file mode 100644 index be42a63..0000000 --- a/spark_conf/workers.template +++ /dev/null @@ -1,19 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# A Spark Worker will be started on each of the machines listed below. 
-localhost \ No newline at end of file From 5a0f9cc5b4b15d4167da00e1997cf15e59afc79c Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Fri, 11 Mar 2022 12:27:00 +0100 Subject: [PATCH 03/12] Use ssd storage --- nixos/base.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nixos/base.nix b/nixos/base.nix index 4a68410..69b4d73 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -273,6 +273,8 @@ in hdfsSite = { "dfs.namenode.rpc-bind-host" = "0.0.0.0"; "dfs.permissions" = "false"; + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; }; hdfs = { From 1680b2bc455e86658b47ee6e090ed93d90b96383 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Mon, 21 Mar 2022 09:50:26 +0100 Subject: [PATCH 04/12] Centralized big_data config --- nixos/base.nix | 120 +-------------------------------------------- nixos/big_data.nix | 120 +++++++++++++++++++++++++++++++++++++++++++++ nixos/thor.nix | 93 +---------------------------------- 3 files changed, 122 insertions(+), 211 deletions(-) create mode 100644 nixos/big_data.nix diff --git a/nixos/base.nix b/nixos/base.nix index 69b4d73..3240307 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -1,45 +1,4 @@ { config, pkgs, lib, ... }: - -let sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; - dontUnpack = true; - installPhase = '' - # source standard environment - . 
$stdenv/setup - - # shorthands - base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ - - # create output dirs for new derivation - mkdir -p $out/ - - # link unchanged files from the original gnome-session - for f in $base_conf/*.template ; do - ln -sf $f $out/ - done - - # change selected files - cp $out/log4j.properties{.template,} - - cat > $out/spark-env.sh <<- STOP - export JAVA_HOME="${pkgs.jdk8}" - export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" - export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" - export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" - export SPARKR_R_SHELL="${pkgs.R}/bin/R" - export PATH="\$PATH:${pkgs.R}/bin" - STOP - - cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - STOP - ''; -}; -in - { imports = [ @@ -53,6 +12,7 @@ in # ./defcon.nix # ./mind.nix # ./k3s.nix + ./big_data.nix ]; # Use the systemd-boot EFI boot loader. 
@@ -252,84 +212,6 @@ in services.power-profiles-daemon.enable = true; # services.teamviewer.enable = true; - services = { - - spark = { - master = { - enable = true; - restartIfChanged = true; - }; - worker = { - enable = true; - restartIfChanged = true; - }; - confDir = sparkConfDir; - }; - - hadoop = { - coreSite = { - "fs.defaultFS" = "hdfs://localhost:8020"; - }; - hdfsSite = { - "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - "dfs.permissions" = "false"; - "dfs.namenode.name.dir" = "/hdfs/dfs/name"; - "dfs.datanode.data.dir" = "/hdfs/dfs/data"; - }; - - hdfs = { - namenode = { - enable = true; - formatOnInit = true; - restartIfChanged = true; - }; - datanode = { - enable = true; - restartIfChanged = true; - }; - journalnode = { - enable = true; - restartIfChanged = true; - }; - zkfc = { - enable = true; - restartIfChanged = true; - }; - httpfs = { - enable = true; - restartIfChanged = true; - }; - }; - yarn = { - resourcemanager.enable = true; - nodemanager.enable = true; - }; - }; - }; - - systemd.services.spark-history = { - path = with pkgs; [ procps openssh nettools ]; - description = "spark history service."; - after = [ "network.target" ]; - wantedBy = [ "multi-user.target" ]; - restartIfChanged = true; - environment = { - SPARK_CONF_DIR = sparkConfDir; - SPARK_LOG_DIR = "/var/log/spark"; - }; - serviceConfig = { - Type = "forking"; - User = "spark"; - Group = "spark"; - WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; - ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; - ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; - TimeoutSec = 300; - StartLimitBurst = 10; - Restart = "always"; - }; - }; - # Virtualisation virtualisation = { diff --git a/nixos/big_data.nix b/nixos/big_data.nix new file mode 100644 index 0000000..3d61957 --- /dev/null +++ b/nixos/big_data.nix @@ -0,0 +1,120 @@ +{ config, lib, pkgs, ... 
}: +let sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . $stdenv/setup + + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + + # create output dirs for new derivation + mkdir -p $out/ + + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done + + # change selected files + cp $out/log4j.properties{.template,} + + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP + + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + STOP + ''; +}; +in +{ + + services = { + spark = { + master = { + enable = true; + restartIfChanged = true; + }; + worker = { + enable = true; + restartIfChanged = true; + }; + confDir = sparkConfDir; + }; + + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://localhost:8020"; + }; + hdfsSite = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + "dfs.permissions" = "false"; + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + }; + + hdfs = { + namenode = { + enable = true; + formatOnInit = true; + restartIfChanged = true; + }; + datanode = { + enable = true; + restartIfChanged = true; + }; + journalnode = { + enable = true; + restartIfChanged = true; + }; + zkfc = { + enable = true; + restartIfChanged = true; + }; + httpfs = { + enable = true; + restartIfChanged = true; + }; + }; + yarn = { + resourcemanager.enable = true; + 
nodemanager.enable = true; + }; + }; + }; + + systemd.services.spark-history = { + path = with pkgs; [ procps openssh nettools ]; + description = "spark history service."; + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + restartIfChanged = true; + environment = { + SPARK_CONF_DIR = sparkConfDir; + SPARK_LOG_DIR = "/var/log/spark"; + }; + serviceConfig = { + Type = "forking"; + User = "spark"; + Group = "spark"; + WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; + ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; + ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; + TimeoutSec = 300; + StartLimitBurst = 10; + Restart = "always"; + }; + }; + +} diff --git a/nixos/thor.nix b/nixos/thor.nix index a2a8371..7b41f46 100644 --- a/nixos/thor.nix +++ b/nixos/thor.nix @@ -4,51 +4,13 @@ { config, pkgs, lib, ... }: -let sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; - dontUnpack = true; - installPhase = '' - # source standard environment - . 
$stdenv/setup - - # shorthands - base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ - - # create output dirs for new derivation - mkdir -p $out/ - - # link unchanged files from the original gnome-session - for f in $base_conf/*.template ; do - ln -sf $f $out/ - done - - # change selected files - cp $out/log4j.properties{.template,} - - cat > $out/spark-env.sh <<- STOP - export JAVA_HOME="${pkgs.jdk8}" - export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" - export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" - export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" - export SPARKR_R_SHELL="${pkgs.R}/bin/R" - export PATH="\$PATH:${pkgs.R}/bin" - STOP - - cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - STOP - ''; -}; -in { imports = [ /etc/nixos/hardware-configuration.nix ./pro_audio.nix + ./big_data.nix ]; boot = { @@ -195,59 +157,6 @@ in # gnome.gnome-remote-desktop.enable = true; zerotierone = { enable = true; joinNetworks = [ "8056c2e21cf9c753" ]; }; - - - spark = { - master = { - enable = true; - restartIfChanged = true; - }; - worker = { - enable = true; - restartIfChanged = true; - }; - confDir = sparkConfDir; - }; - - hadoop = { - coreSite = { - "fs.defaultFS" = "hdfs://localhost:8020"; - }; - hdfsSite = { - "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - "dfs.permissions" = "false"; - }; - - hdfs = { - namenode = { - enable = true; - formatOnInit = true; - restartIfChanged = true; - }; - datanode = { - enable = true; - restartIfChanged = true; - }; - journalnode = { - enable = true; - restartIfChanged = true; - }; - zkfc = { - enable = true; - restartIfChanged = true; - }; - httpfs = { - enable = true; - restartIfChanged = true; - }; - }; - yarn = { - resourcemanager.enable = true; - nodemanager.enable = true; 
- }; - }; - - ethminer = { enable = false; wallet = "0x73b788882e1C182123333f42FFf275B7dd7f51bb"; From 2bcd4152a991f6e25f4ec15d38d093322c94d5f5 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Mon, 21 Mar 2022 12:21:22 +0100 Subject: [PATCH 05/12] Basic kerberos config --- nixos/big_data.nix | 36 ++++++++++++++++++++++++++++++++++++ odin.nix | 1 - 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 3d61957..44e8aeb 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -92,8 +92,44 @@ in nodemanager.enable = true; }; }; + + kerberos_server = { + enable = true; + realms."ATHENA.MIT.EDU" = { + acl = [ + { access = "all"; principal = "*/admin"; } + { access = "all"; principal = "admin"; } + ]; + }; + }; }; + krb5 = { + enable = true; + realms."ATHENA.MIT.EDU" = { + admin_server = "localhost"; + kdc = [ + "localhost" + ]; + kpasswd_server = "localhost"; + }; + domain_realm = { + ".athena.mit.edu" = "ATHENA.MIT.EDU"; + "athena.mit.edu" = "ATHENA.MIT.EDU"; + }; + libdefaults = { + default_realm = "ATHENA.MIT.EDU"; + dns_lookup_realm = false; + dns_lookup_kdc = false; + }; + extraConfig = '' + [logging] + default = FILE:/var/log/krb5.log + ''; + }; + + + systemd.services.spark-history = { path = with pkgs; [ procps openssh nettools ]; description = "spark history service."; diff --git a/odin.nix b/odin.nix index c5df576..528bf04 100644 --- a/odin.nix +++ b/odin.nix @@ -45,7 +45,6 @@ ark authy bitwarden - blender btop catgirl # IRC cava From 7ad8a748f1b4fce010d8f4d35f8be94ed4fc1a78 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Thu, 24 Mar 2022 09:36:46 +0100 Subject: [PATCH 06/12] Kerberos config --- nixos/big_data.nix | 211 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 160 insertions(+), 51 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 44e8aeb..d4d161e 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -40,6 +40,17 @@ let sparkConfDir = 
pkgs.stdenv.mkDerivation { in { + networking = { + hosts = { + "127.0.0.1" = [ + "ds.my.engine" + "kdc.my.engine" + "my.engine" + ]; + }; + + }; + services = { spark = { master = { @@ -53,83 +64,181 @@ in confDir = sparkConfDir; }; - hadoop = { - coreSite = { - "fs.defaultFS" = "hdfs://localhost:8020"; - }; - hdfsSite = { - "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - "dfs.permissions" = "false"; - "dfs.namenode.name.dir" = "/hdfs/dfs/name"; - "dfs.datanode.data.dir" = "/hdfs/dfs/data"; - }; - hdfs = { - namenode = { - enable = true; - formatOnInit = true; - restartIfChanged = true; + + hadoop = + let + keytab_path = /etc/hadoop.keytab; + in + + { + coreSite = { + # "fs.defaultFS" = "hdfs://0.0.0.0:8020"; + + # "hadoop.http.authentication.simple.anonymous.allowed" = "false"; + # "hadoop.http.authentication.signature.secret.file" = "/var/lib/hadoop/security/http_secret"; + # "hadoop.http.authentication.type" = "kerberos"; + # "hadoop.http.authentication.kerberos.principal" = "http/my.engine@MY.ENGINE"; + # "hadoop.http.authentication.cookie.domain" = "my.engine"; + + # "hadoop.security.authentication" = "kerberos"; + # "hadoop.security.authorization" = "true"; + # "hadoop.rpc.protection" = "authentication"; + + + # "hadoop.rpc.protection" = "authentication"; + # "hadoop.security.auth_to_local" = '' + # RULE:[2:$1/$2@$0]([ndj]n/.*@MY.ENGINE)s/.*/hdfs/ + # RULE:[2:$1/$2@$0]([rn]m/.*@MY.ENGINE)s/.*/yarn/ + # RULE:[2:$1/$2@$0](jhs/.*@MY.ENGINE)s/.*/mapred/ + # DEFAULT + # ''; + # "hadoop.proxyuser.superuser.hosts" = "*"; # TODO: restrict + # "hadoop.proxyuser.superuser.groups" = "*"; # TODO: restrict + + "fs.defaultFS" = "hdfs://my.engine:8020"; + + # HDFS IMPERSONATION + "hadoop.proxyuser.hdfs.hosts" = "*"; + "hadoop.proxyuser.hdfs.groups" = "*"; + + # HIVE IMPERSONATION + "hadoop.proxyuser.hive.hosts" = "*"; + "hadoop.proxyuser.hive.groups" = "*"; + + # ENABLE AUTHENTICATION + "hadoop.security.authentication" = "kerberos"; + "hadoop.security.authorization" = "true"; + 
"hadoop.rpc.protection" = "privacy"; + + "hadoop.security.auth_to_local" = '' + RULE:[2:$1/$2@$0]([ndj]n/.*@MY\.ENGINE)s/.*/hdfs/ + RULE:[2:$1/$2@$0]([rn]m/.*@MY\.ENGINE)s/.*/yarn/ + RULE:[2:$1/$2@$0](jhs/.*@MY\.ENGINE)s/.*/mapred/ + DEFAULT + ''; }; - datanode = { - enable = true; - restartIfChanged = true; + hdfsSite = { + # DATA + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + + # HDFS SECURITY + "dfs.block.access.token.enable" = "true"; + + # NAME NODE SECURITY + "dfs.namenode.keytab.file" = keytab_path; + "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # SECONDARY NAME NODE SECURITY + "dfs.secondary.namenode.keytab.file" = keytab_path; + "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # DATA NODE SECURITY + "dfs.datanode.keytab.file" = keytab_path; + "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; + + # WEBHDFS SECURITY + "dfs.webhdfs.enabled" = "true"; + + # WEB AUTHENTICATION CONFIG + "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; + "dfs.web.authentication.kerberos.keytab" = keytab_path; + "ignore.secure.ports.for.testing" = "true"; + "dfs.http.policy" = "HTTP_ONLY"; + "dfs.data.transfer.protection" = "privacy"; + + # ## MULTIHOMED + # "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.servicerpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.http-bind-host" = "0.0.0.0"; + # "dfs.namenode.https-bind-host" = "0.0.0.0"; + # "dfs.client.use.datanode.hostname" = "true"; # force connection by hostname + # "dfs.datanode.use.datanode.hostname" = "true"; # force connection by hostname + + + # "dfs.data.transfer.protection" = "privacy"; + # "hadoop.rpc.protection" = "privacy"; + # "dfs.http.policy" = "HTTP_ONLY"; + # "dfs.datanode.address" = "0.0.0.0:10019"; + # 
"dfs.datanode.http.address" = "0.0.0.0:10022"; + # "dfs.datanode.https.address" = "0.0.0.0:10023"; + + + # "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; + # "dfs.datanode.keytab.file" = keytab_path; + + # "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + # "dfs.namenode.keytab.file" = keytab_path; + + # "dfs.block.access.token.enable" = "true"; + + }; - journalnode = { - enable = true; - restartIfChanged = true; + yarnSite = { + # "yarn.acl.enable" = "true"; + # "yarn.admin.acl" = "*"; # TODO: restrict }; - zkfc = { - enable = true; - restartIfChanged = true; - }; - httpfs = { - enable = true; - restartIfChanged = true; + extraConfDirs = [ ]; + + hdfs = { + namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; + datanode = { enable = true; restartIfChanged = true; }; + journalnode = { enable = true; restartIfChanged = true; }; + zkfc = { enable = true; restartIfChanged = true; }; + httpfs = { enable = true; restartIfChanged = true; }; }; + yarn = { resourcemanager.enable = true; nodemanager.enable = true; }; }; - yarn = { - resourcemanager.enable = true; - nodemanager.enable = true; - }; - }; kerberos_server = { enable = true; - realms."ATHENA.MIT.EDU" = { - acl = [ - { access = "all"; principal = "*/admin"; } - { access = "all"; principal = "admin"; } - ]; - }; + realms."MY.ENGINE".acl = [ + { principal = "*/admin"; access = "all"; } + { principal = "admin"; access = "all"; } + { principal = "*/localhost"; access = "all"; } + { principal = "*/my.engine"; access = "all"; } + { principal = "nn/my.engine"; access = "all"; } + { principal = "hdfs"; access = "all"; } + ]; }; }; krb5 = { enable = true; - realms."ATHENA.MIT.EDU" = { - admin_server = "localhost"; - kdc = [ - "localhost" - ]; - kpasswd_server = "localhost"; + realms = { + "MY.ENGINE" = { + admin_server = "kdc.my.engine"; + kdc = "kdc.my.engine"; + # default_domain = "my.engine"; + # kpasswd_server = "odin"; + }; }; domain_realm = { - ".athena.mit.edu" = 
"ATHENA.MIT.EDU"; - "athena.mit.edu" = "ATHENA.MIT.EDU"; + # ".my.engine" = "MY.ENGINE"; + "my.engine" = "MY.ENGINE"; }; libdefaults = { - default_realm = "ATHENA.MIT.EDU"; - dns_lookup_realm = false; - dns_lookup_kdc = false; + default_realm = "MY.ENGINE"; + dns_lookup_realm = true; + dns_lookup_kdc = true; + ticket_lifetime = "24h"; + renew_lifetime = "7d"; + forwardable = true; }; extraConfig = '' [logging] - default = FILE:/var/log/krb5.log + default = FILE:/var/log/krb5libs.log + kdc = FILE:/var/log/krb5kdc.log + admin_server = FILE:/var/log/kadmind.log ''; }; + systemd.services.spark-history = { path = with pkgs; [ procps openssh nettools ]; description = "spark history service."; From a27dbfd6813fe44c72beac93c20e8d40278889f7 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Thu, 24 Mar 2022 17:51:24 +0100 Subject: [PATCH 07/12] HDSF + KERBEROS --- nixos/base.nix | 3 +- nixos/big_data.nix | 300 ++++++++++++++++++++++----------------------- 2 files changed, 146 insertions(+), 157 deletions(-) diff --git a/nixos/base.nix b/nixos/base.nix index 3240307..0b5ded8 100644 --- a/nixos/base.nix +++ b/nixos/base.nix @@ -1,6 +1,5 @@ { config, pkgs, lib, ... }: { - imports = [ @@ -171,7 +170,7 @@ }; }; - services.dbus.packages = with pkgs; [ gnome.dconf ]; + services.dbus.packages = with pkgs; [ dconf ]; services.gnome.gnome-keyring.enable = true; hardware.bluetooth.enable = true; # services.blueman.enable = true; diff --git a/nixos/big_data.nix b/nixos/big_data.nix index d4d161e..3efcad5 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -1,42 +1,52 @@ { config, lib, pkgs, ... }: -let sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; - dontUnpack = true; - installPhase = '' - # source standard environment - . 
$stdenv/setup +let + keytab_path = /etc/hadoop.keytab; + hadoopConf = import { + inherit pkgs lib; + cfg = config.services.hadoop; + }; + hadoopConfDir = "${hadoopConf}/"; + sparkConfDir = pkgs.stdenv.mkDerivation { + name = "spark-config"; + dontUnpack = true; + installPhase = '' + # source standard environment + . $stdenv/setup - # shorthands - base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ + # shorthands + base_conf=${pkgs.spark}/lib/${pkgs.spark.untarDir}/conf/ - # create output dirs for new derivation - mkdir -p $out/ + # create output dirs for new derivation + mkdir -p $out/ - # link unchanged files from the original gnome-session - for f in $base_conf/*.template ; do - ln -sf $f $out/ - done + # link unchanged files from the original gnome-session + for f in $base_conf/*.template ; do + ln -sf $f $out/ + done - # change selected files - cp $out/log4j.properties{.template,} + # change selected files + cp $out/log4j.properties{.template,} - cat > $out/spark-env.sh <<- STOP - export JAVA_HOME="${pkgs.jdk8}" - export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" - export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" - export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" - export SPARKR_R_SHELL="${pkgs.R}/bin/R" - export PATH="\$PATH:${pkgs.R}/bin" - STOP + cat > $out/spark-env.sh <<- STOP + export JAVA_HOME="${pkgs.jdk8}" + export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" + export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) + export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" + export HADOOP_CONF_DIR="${hadoopConfDir}" + export SPARKR_R_SHELL="${pkgs.R}/bin/R" + export PATH="\$PATH:${pkgs.R}/bin" + STOP - cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir 
hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - STOP - ''; -}; + cat > $out/spark-defaults.conf <<- STOP + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + spark.yarn.keytab ${keytab_path} + spark.yarn.principal spark/my.engine@MY.ENGINE + STOP + ''; + }; in { @@ -66,132 +76,113 @@ in - hadoop = - let - keytab_path = /etc/hadoop.keytab; - in + hadoop = { + coreSite = { + "fs.defaultFS" = "hdfs://my.engine:8020"; - { - coreSite = { - # "fs.defaultFS" = "hdfs://0.0.0.0:8020"; + # HDFS IMPERSONATION + "hadoop.proxyuser.hdfs.hosts" = "*"; + "hadoop.proxyuser.hdfs.groups" = "*"; - # "hadoop.http.authentication.simple.anonymous.allowed" = "false"; - # "hadoop.http.authentication.signature.secret.file" = "/var/lib/hadoop/security/http_secret"; - # "hadoop.http.authentication.type" = "kerberos"; - # "hadoop.http.authentication.kerberos.principal" = "http/my.engine@MY.ENGINE"; - # "hadoop.http.authentication.cookie.domain" = "my.engine"; + # HIVE IMPERSONATION + "hadoop.proxyuser.hive.hosts" = "*"; + "hadoop.proxyuser.hive.groups" = "*"; - # "hadoop.security.authentication" = "kerberos"; - # "hadoop.security.authorization" = "true"; - # "hadoop.rpc.protection" = "authentication"; + # ENABLE AUTHENTICATION + "hadoop.security.authentication" = "kerberos"; + "hadoop.security.authorization" = "true"; + "hadoop.rpc.protection" = "privacy"; - - # "hadoop.rpc.protection" = "authentication"; - # "hadoop.security.auth_to_local" = '' - # RULE:[2:$1/$2@$0]([ndj]n/.*@MY.ENGINE)s/.*/hdfs/ - # RULE:[2:$1/$2@$0]([rn]m/.*@MY.ENGINE)s/.*/yarn/ - # RULE:[2:$1/$2@$0](jhs/.*@MY.ENGINE)s/.*/mapred/ - # DEFAULT - # ''; - # "hadoop.proxyuser.superuser.hosts" = "*"; # TODO: restrict - # "hadoop.proxyuser.superuser.groups" = "*"; # TODO: restrict - - "fs.defaultFS" = "hdfs://my.engine:8020"; - - # HDFS IMPERSONATION - "hadoop.proxyuser.hdfs.hosts" = "*"; - 
"hadoop.proxyuser.hdfs.groups" = "*"; - - # HIVE IMPERSONATION - "hadoop.proxyuser.hive.hosts" = "*"; - "hadoop.proxyuser.hive.groups" = "*"; - - # ENABLE AUTHENTICATION - "hadoop.security.authentication" = "kerberos"; - "hadoop.security.authorization" = "true"; - "hadoop.rpc.protection" = "privacy"; - - "hadoop.security.auth_to_local" = '' - RULE:[2:$1/$2@$0]([ndj]n/.*@MY\.ENGINE)s/.*/hdfs/ - RULE:[2:$1/$2@$0]([rn]m/.*@MY\.ENGINE)s/.*/yarn/ - RULE:[2:$1/$2@$0](jhs/.*@MY\.ENGINE)s/.*/mapred/ - DEFAULT - ''; - }; - hdfsSite = { - # DATA - "dfs.namenode.name.dir" = "/hdfs/dfs/name"; - "dfs.datanode.data.dir" = "/hdfs/dfs/data"; - - # HDFS SECURITY - "dfs.block.access.token.enable" = "true"; - - # NAME NODE SECURITY - "dfs.namenode.keytab.file" = keytab_path; - "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; - "dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; - - # SECONDARY NAME NODE SECURITY - "dfs.secondary.namenode.keytab.file" = keytab_path; - "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; - "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; - - # DATA NODE SECURITY - "dfs.datanode.keytab.file" = keytab_path; - "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; - - # WEBHDFS SECURITY - "dfs.webhdfs.enabled" = "true"; - - # WEB AUTHENTICATION CONFIG - "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; - "dfs.web.authentication.kerberos.keytab" = keytab_path; - "ignore.secure.ports.for.testing" = "true"; - "dfs.http.policy" = "HTTP_ONLY"; - "dfs.data.transfer.protection" = "privacy"; - - # ## MULTIHOMED - # "dfs.namenode.rpc-bind-host" = "0.0.0.0"; - # "dfs.namenode.servicerpc-bind-host" = "0.0.0.0"; - # "dfs.namenode.http-bind-host" = "0.0.0.0"; - # "dfs.namenode.https-bind-host" = "0.0.0.0"; - # "dfs.client.use.datanode.hostname" = "true"; # force connection by hostname - # "dfs.datanode.use.datanode.hostname" = "true"; # 
force connection by hostname - - - # "dfs.data.transfer.protection" = "privacy"; - # "hadoop.rpc.protection" = "privacy"; - # "dfs.http.policy" = "HTTP_ONLY"; - # "dfs.datanode.address" = "0.0.0.0:10019"; - # "dfs.datanode.http.address" = "0.0.0.0:10022"; - # "dfs.datanode.https.address" = "0.0.0.0:10023"; - - - # "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; - # "dfs.datanode.keytab.file" = keytab_path; - - # "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; - # "dfs.namenode.keytab.file" = keytab_path; - - # "dfs.block.access.token.enable" = "true"; - - - }; - yarnSite = { - # "yarn.acl.enable" = "true"; - # "yarn.admin.acl" = "*"; # TODO: restrict - }; - extraConfDirs = [ ]; - - hdfs = { - namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; - datanode = { enable = true; restartIfChanged = true; }; - journalnode = { enable = true; restartIfChanged = true; }; - zkfc = { enable = true; restartIfChanged = true; }; - httpfs = { enable = true; restartIfChanged = true; }; - }; - yarn = { resourcemanager.enable = true; nodemanager.enable = true; }; + "hadoop.security.auth_to_local" = '' + RULE:[2:$1/$2@$0]([ndj]n/.*@MY\.ENGINE)s/.*/hdfs/ + RULE:[2:$1/$2@$0]([rn]m/.*@MY\.ENGINE)s/.*/yarn/ + RULE:[2:$1/$2@$0](jhs/.*@MY\.ENGINE)s/.*/mapred/ + DEFAULT + ''; }; + hdfsSite = { + # DATA + "dfs.namenode.name.dir" = "/hdfs/dfs/name"; + "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + + # HDFS SECURITY + "dfs.block.access.token.enable" = "true"; + + # NAME NODE SECURITY + "dfs.namenode.keytab.file" = keytab_path; + "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # SECONDARY NAME NODE SECURITY + "dfs.secondary.namenode.keytab.file" = keytab_path; + "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; + "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; + + # DATA NODE SECURITY 
+ "dfs.datanode.keytab.file" = keytab_path; + "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; + + # JOURNAL NODE SECURITY + "dfs.journalnode.keytab.file" = keytab_path; + "dfs.journalnode.kerberos.principal" = "jn/my.engine@MY.ENGINE"; + + # WEBHDFS SECURITY + "dfs.webhdfs.enabled" = "true"; + + # WEB AUTHENTICATION CONFIG + "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; + "dfs.web.authentication.kerberos.keytab" = keytab_path; + "ignore.secure.ports.for.testing" = "true"; + "dfs.http.policy" = "HTTP_ONLY"; + "dfs.data.transfer.protection" = "privacy"; + + # ## MULTIHOMED + # "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.servicerpc-bind-host" = "0.0.0.0"; + # "dfs.namenode.http-bind-host" = "0.0.0.0"; + # "dfs.namenode.https-bind-host" = "0.0.0.0"; + # "dfs.client.use.datanode.hostname" = "true"; # force connection by hostname + # "dfs.datanode.use.datanode.hostname" = "true"; # force connection by hostname + }; + yarnSite = { + "yarn.nodemanager.admin-env" = "PATH=$PATH"; + "yarn.nodemanager.aux-services" = "mapreduce_shuffle"; + "yarn.nodemanager.aux-services.mapreduce_shuffle.class" = "org.apache.hadoop.mapred.ShuffleHandler"; + "yarn.nodemanager.bind-host" = "0.0.0.0"; + "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; + "yarn.nodemanager.env-whitelist" = "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,LANG,TZ"; + "yarn.nodemanager.linux-container-executor.group" = "hadoop"; + "yarn.nodemanager.linux-container-executor.path" = "/run/wrappers/yarn-nodemanager/bin/container-executor"; + "yarn.nodemanager.log-dirs" = "/var/log/hadoop/yarn/nodemanager"; + "yarn.resourcemanager.bind-host" = "0.0.0.0"; + "yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler"; + + "yarn.resourcemanager.keytab" = keytab_path; + 
"yarn.resourcemanager.principal" = "rm/my.engine@MY.ENGINE"; + "yarn.nodemanager.keytab" = keytab_path; + "yarn.nodemanager.principal" = "nm/my.engine@MY.ENGINE"; + + # "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; + + "yarn.scheduler.capacity.root.queues" = "default"; + "yarn.scheduler.capacity.root.default.capacity" = 100; + # "yarn.scheduler.capacity.root.default.state" = "RUNNING"; + "yarn.scheduler.capacity.root.acl_submit_applications" = "hadoop,yarn,mapred,hdfs"; + }; + extraConfDirs = [ ]; + + hdfs = { + namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; + datanode = { enable = true; restartIfChanged = true; }; + journalnode = { enable = true; restartIfChanged = true; }; + zkfc = { enable = true; restartIfChanged = true; }; + httpfs = { enable = true; restartIfChanged = true; }; + }; + yarn = { + resourcemanager = { enable = true; restartIfChanged = true; }; + nodemanager = { enable = true; restartIfChanged = true; }; + }; + }; kerberos_server = { enable = true; @@ -236,8 +227,7 @@ in ''; }; - - + users.users.bertof.extraGroups = [ "hadoop" ]; systemd.services.spark-history = { path = with pkgs; [ procps openssh nettools ]; From 86a6fbac7679fe517a4d9eb7863de5b08d003624 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Tue, 29 Mar 2022 09:47:57 +0200 Subject: [PATCH 08/12] Update big data config --- nixos/big_data.nix | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 3efcad5..7566112 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -1,11 +1,16 @@ { config, lib, pkgs, ... 
}: let keytab_path = /etc/hadoop.keytab; + pysparkPackageSelector = p: with p; [ numpy pyspark ]; + pysparkEnv = pkgs.python3.withPackages pysparkPackageSelector; hadoopConf = import { inherit pkgs lib; cfg = config.services.hadoop; }; hadoopConfDir = "${hadoopConf}/"; + spark = pkgs.spark.override { + extraPythonPackages = pysparkPackageSelector pkgs.python3.pkgs; + }; sparkConfDir = pkgs.stdenv.mkDerivation { name = "spark-config"; dontUnpack = true; @@ -31,7 +36,8 @@ let export JAVA_HOME="${pkgs.jdk8}" export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pkgs.python3Packages.python}/bin/${pkgs.python3Packages.python.executable}" + export PYSPARK_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" + export PYSPARK_DRIVER_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" export HADOOP_CONF_DIR="${hadoopConfDir}" export SPARKR_R_SHELL="${pkgs.R}/bin/R" @@ -42,8 +48,8 @@ let spark.eventLog.enabled true spark.eventLog.dir hdfs://localhost:/logs/spark spark.history.fs.logDirectory hdfs://localhost:/logs/spark - spark.yarn.keytab ${keytab_path} - spark.yarn.principal spark/my.engine@MY.ENGINE + # spark.yarn.keytab ${keytab_path} + # spark.yarn.principal spark/my.engine@MY.ENGINE STOP ''; }; @@ -63,19 +69,12 @@ in services = { spark = { - master = { - enable = true; - restartIfChanged = true; - }; - worker = { - enable = true; - restartIfChanged = true; - }; + package = spark; + master = { enable = true; restartIfChanged = true; }; + worker = { enable = true; restartIfChanged = true; }; confDir = sparkConfDir; }; - - hadoop = { coreSite = { "fs.defaultFS" = "hdfs://my.engine:8020"; @@ -188,11 +187,7 @@ in enable = true; realms."MY.ENGINE".acl = [ { principal = "*/admin"; access = "all"; } - { principal = "admin"; access = "all"; } - { principal = "*/localhost"; access = "all"; } { principal = 
"*/my.engine"; access = "all"; } - { principal = "nn/my.engine"; access = "all"; } - { principal = "hdfs"; access = "all"; } ]; }; }; From 52047104cc40f0feeab90eb33ffd9fd9755f5efa Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Wed, 30 Mar 2022 09:35:53 +0200 Subject: [PATCH 09/12] Fix spark python + spark history kerberos --- nixos/big_data.nix | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 7566112..eef0133 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -12,7 +12,7 @@ let extraPythonPackages = pysparkPackageSelector pkgs.python3.pkgs; }; sparkConfDir = pkgs.stdenv.mkDerivation { - name = "spark-config"; + name = "spark-conf"; dontUnpack = true; installPhase = '' # source standard environment @@ -36,8 +36,8 @@ let export JAVA_HOME="${pkgs.jdk8}" export SPARK_HOME="${pkgs.spark}/lib/${pkgs.spark.untarDir}" export SPARK_DIST_CLASSPATH=$(${pkgs.hadoop}/bin/hadoop classpath) - export PYSPARK_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" - export PYSPARK_DRIVER_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}:" + export PYSPARK_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}" + export PYSPARK_DRIVER_PYTHON="${pysparkEnv.outPath}/bin/${pysparkEnv.executable}" export PYTHONPATH="\$PYTHONPATH:$PYTHONPATH" export HADOOP_CONF_DIR="${hadoopConfDir}" export SPARKR_R_SHELL="${pkgs.R}/bin/R" @@ -45,11 +45,17 @@ let STOP cat > $out/spark-defaults.conf <<- STOP - spark.eventLog.enabled true - spark.eventLog.dir hdfs://localhost:/logs/spark - spark.history.fs.logDirectory hdfs://localhost:/logs/spark - # spark.yarn.keytab ${keytab_path} - # spark.yarn.principal spark/my.engine@MY.ENGINE + spark.eventLog.enabled true + spark.eventLog.dir hdfs://localhost:/logs/spark + spark.history.fs.logDirectory hdfs://localhost:/logs/spark + # spark.yarn.keytab ${keytab_path} + # spark.yarn.principal spark/my.engine@MY.ENGINE + spark.history.ui.acls.enable 
true + spark.history.kerberos.enabled true + spark.history.kerberos.keytab ${keytab_path} + spark.history.kerberos.principal spark/my.engine@MY.ENGINE + spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} + spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages STOP ''; }; @@ -174,7 +180,7 @@ in namenode = { enable = true; formatOnInit = true; restartIfChanged = true; }; datanode = { enable = true; restartIfChanged = true; }; journalnode = { enable = true; restartIfChanged = true; }; - zkfc = { enable = true; restartIfChanged = true; }; + zkfc = { enable = false; restartIfChanged = true; }; # ZOOKEEPER DISABLED, not using High Availability setup httpfs = { enable = true; restartIfChanged = true; }; }; yarn = { From 10c2df06ec13f1d27a201ec69329adb1e8e1d4da Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Wed, 30 Mar 2022 16:15:30 +0200 Subject: [PATCH 10/12] Update big data --- nixos/big_data.nix | 39 +++++++++++++++++++++++++++++---------- nixos/thor.nix | 23 ----------------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index eef0133..f77009e 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -1,6 +1,24 @@ { config, lib, pkgs, ... 
}: let - keytab_path = /etc/hadoop.keytab; + setup_script = '' + sudo mkdir -p /hdfs + sudo chown -R hdfs:hadoop /hdfs + + for p in {nn,dn,jn,rm,nm,jhs,HTTP}; do + sudo kadmin.local -q "ank -randkey $p/my.engine"; + sudo kadmin.local -q "xst -k /etc/hadoop.keytab $p/my.engine"; + sudo kadmin.local -q "ktrem -k /etc/hadoop.keytab $p/my.engine old" + done + sudo chown hdfs:hadoop /etc/hadoop.keytab + + + sudo kadmin.local -q "ank -randkey spark/my.engine"; + sudo kadmin.local -q "xst -k /etc/spark.keytab spark/my.engine"; + sudo kadmin.local -q "ktrem -k /etc/spark.keytab spark/my.engine old" + sudo chown spark:spark /etc/spark.keytab + ''; + hadoop_keytab_path = "/etc/hadoop.keytab"; + spark_keytab_path = "/etc/spark.keytab"; pysparkPackageSelector = p: with p; [ numpy pyspark ]; pysparkEnv = pkgs.python3.withPackages pysparkPackageSelector; hadoopConf = import { @@ -48,11 +66,11 @@ let spark.eventLog.enabled true spark.eventLog.dir hdfs://localhost:/logs/spark spark.history.fs.logDirectory hdfs://localhost:/logs/spark - # spark.yarn.keytab ${keytab_path} + # spark.yarn.keytab ${hadoop_keytab_path} # spark.yarn.principal spark/my.engine@MY.ENGINE spark.history.ui.acls.enable true spark.history.kerberos.enabled true - spark.history.kerberos.keytab ${keytab_path} + spark.history.kerberos.keytab ${hadoop_keytab_path} spark.history.kerberos.principal spark/my.engine@MY.ENGINE spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages @@ -109,26 +127,27 @@ in # DATA "dfs.namenode.name.dir" = "/hdfs/dfs/name"; "dfs.datanode.data.dir" = "/hdfs/dfs/data"; + "dfs.journalnode.edits.dir" = "/hdfs/dfs/edits"; # HDFS SECURITY "dfs.block.access.token.enable" = "true"; # NAME NODE SECURITY - "dfs.namenode.keytab.file" = keytab_path; + "dfs.namenode.keytab.file" = hadoop_keytab_path; "dfs.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; 
"dfs.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; # SECONDARY NAME NODE SECURITY - "dfs.secondary.namenode.keytab.file" = keytab_path; + "dfs.secondary.namenode.keytab.file" = hadoop_keytab_path; "dfs.secondary.namenode.kerberos.principal" = "nn/my.engine@MY.ENGINE"; "dfs.secondary.namenode.kerberos.internal.spnego.principal" = "HTTP/my.engine@MY.ENGINE"; # DATA NODE SECURITY - "dfs.datanode.keytab.file" = keytab_path; + "dfs.datanode.keytab.file" = hadoop_keytab_path; "dfs.datanode.kerberos.principal" = "dn/my.engine@MY.ENGINE"; # JOURNAL NODE SECURITY - "dfs.journalnode.keytab.file" = keytab_path; + "dfs.journalnode.keytab.file" = hadoop_keytab_path; "dfs.journalnode.kerberos.principal" = "jn/my.engine@MY.ENGINE"; # WEBHDFS SECURITY @@ -136,7 +155,7 @@ in # WEB AUTHENTICATION CONFIG "dfs.web.authentication.kerberos.principal" = "HTTP/my.engine@MY.ENGINE"; - "dfs.web.authentication.kerberos.keytab" = keytab_path; + "dfs.web.authentication.kerberos.keytab" = hadoop_keytab_path; "ignore.secure.ports.for.testing" = "true"; "dfs.http.policy" = "HTTP_ONLY"; "dfs.data.transfer.protection" = "privacy"; @@ -162,9 +181,9 @@ in "yarn.resourcemanager.bind-host" = "0.0.0.0"; "yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler"; - "yarn.resourcemanager.keytab" = keytab_path; + "yarn.resourcemanager.keytab" = hadoop_keytab_path; "yarn.resourcemanager.principal" = "rm/my.engine@MY.ENGINE"; - "yarn.nodemanager.keytab" = keytab_path; + "yarn.nodemanager.keytab" = hadoop_keytab_path; "yarn.nodemanager.principal" = "nm/my.engine@MY.ENGINE"; # "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; diff --git a/nixos/thor.nix b/nixos/thor.nix index 7b41f46..e681db1 100644 --- a/nixos/thor.nix +++ b/nixos/thor.nix @@ -172,29 +172,6 @@ # teamviewer.enable = true; }; - systemd.services.spark-history = { - path = with pkgs; [ procps 
openssh nettools ]; - description = "spark history service."; - after = [ "network.target" ]; - wantedBy = [ "multi-user.target" ]; - restartIfChanged = true; - environment = { - SPARK_CONF_DIR = sparkConfDir; - SPARK_LOG_DIR = "/var/log/spark"; - }; - serviceConfig = { - Type = "forking"; - User = "spark"; - Group = "spark"; - WorkingDirectory = "${pkgs.spark}/lib/${pkgs.spark.untarDir}"; - ExecStart = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/start-history-server.sh"; - ExecStop = "${pkgs.spark}/lib/${pkgs.spark.untarDir}/sbin/stop-history-server.sh"; - TimeoutSec = 300; - StartLimitBurst = 10; - Restart = "always"; - }; - }; - services.teamviewer.enable = true; security = { From 2494d1e846e11d8a4a2f1e28c71ae8fd366705e1 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Thu, 31 Mar 2022 10:40:39 +0200 Subject: [PATCH 11/12] Fix spark keytab path --- nixos/big_data.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index f77009e..7e30fad 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -66,11 +66,11 @@ let spark.eventLog.enabled true spark.eventLog.dir hdfs://localhost:/logs/spark spark.history.fs.logDirectory hdfs://localhost:/logs/spark - # spark.yarn.keytab ${hadoop_keytab_path} + # spark.yarn.keytab ${spark_keytab_path} # spark.yarn.principal spark/my.engine@MY.ENGINE spark.history.ui.acls.enable true spark.history.kerberos.enabled true - spark.history.kerberos.keytab ${hadoop_keytab_path} + spark.history.kerberos.keytab ${spark_keytab_path} spark.history.kerberos.principal spark/my.engine@MY.ENGINE spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages STOP ''; }; From 742e6625bce86752325e7bb69b84431e932cd3f9 Mon Sep 17 00:00:00 2001 From: Filippo Berto Date: Wed, 13 Apr 2022 19:23:32 +0200 Subject: [PATCH 12/12] Fix python path --- 
nixos/big_data.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/nixos/big_data.nix b/nixos/big_data.nix index 7e30fad..a734dbc 100644 --- a/nixos/big_data.nix +++ b/nixos/big_data.nix @@ -74,6 +74,7 @@ let spark.history.kerberos.principal spark/my.engine@MY.ENGINE spark.yarn.appMasterEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} spark.yarn.appMasterEnv.PYTHONPATH ${pysparkEnv.outPath}/lib/${pysparkEnv.executable}/site-packages + spark.executorEnv.PYSPARK_PYTHON ${pysparkEnv.outPath}/bin/${pysparkEnv.executable} STOP ''; };