guix-play/guix/scripts/offload.scm
Ludovic Courtès 74afca5dcf
offload: Gracefully report connection failures.
* guix/scripts/offload.scm (open-ssh-session): Check the return value of
'connect!'.  Call 'leave' when it's not 'ok.
2016-12-01 23:35:10 +01:00

693 lines
26 KiB
Scheme
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
;;; GNU Guix is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; GNU Guix is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
(define-module (guix scripts offload)
#:use-module (ssh key)
#:use-module (ssh auth)
#:use-module (ssh session)
#:use-module (ssh channel)
#:use-module (ssh popen)
#:use-module (ssh dist)
#:use-module (ssh dist node)
#:use-module (ssh version)
#:use-module (guix config)
#:use-module (guix records)
#:use-module (guix store)
#:use-module (guix derivations)
#:use-module ((guix serialization)
#:select (nar-error? nar-error-file))
#:use-module (guix nar)
#:use-module (guix utils)
#:use-module ((guix build syscalls) #:select (fcntl-flock))
#:use-module ((guix build utils) #:select (which mkdir-p))
#:use-module (guix ui)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-26)
#:use-module (srfi srfi-34)
#:use-module (srfi srfi-35)
#:use-module (ice-9 popen)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 match)
#:use-module (ice-9 regex)
#:use-module (ice-9 format)
#:use-module (ice-9 binary-ports)
#:export (build-machine
build-requirements
guix-offload))
;;; Commentary:
;;;
;;; Attempt to offload builds to the machines listed in
;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
;;; retrieving the build output(s) over SSH upon success.
;;;
;;; This command should not be used directly; instead, it is called on-demand
;;; by the daemon, unless it was started with '--no-build-hook' or a client
;;; inhibited build hooks.
;;;
;;; Code:
(define-record-type* <build-machine>
build-machine make-build-machine
build-machine?
(name build-machine-name) ; string
(port build-machine-port ; number
(default 22))
(system build-machine-system) ; string
(user build-machine-user) ; string
(private-key build-machine-private-key ; file name
(default (user-openssh-private-key)))
(host-key build-machine-host-key) ; string
(daemon-socket build-machine-daemon-socket ; string
(default "/var/guix/daemon-socket/socket"))
(parallel-builds build-machine-parallel-builds ; number
(default 1))
(speed build-machine-speed ; inexact real
(default 1.0))
(features build-machine-features ; list of strings
(default '())))
(define-record-type* <build-requirements>
build-requirements make-build-requirements
build-requirements?
(system build-requirements-system) ; string
(features build-requirements-features ; list of strings
(default '())))
(define %machine-file
;; File that lists machines available as build slaves.
(string-append %config-directory "/machines.scm"))
(define (user-openssh-private-key)
"Return the user's default SSH private key, or #f if it could not be
determined."
(and=> (getenv "HOME")
(cut string-append <> "/.ssh/id_rsa")))
(define %user-module
;; Module in which the machine description file is loaded.
(let ((module (make-fresh-user-module)))
(module-use! module (resolve-interface '(guix scripts offload)))
module))
(define* (build-machines #:optional (file %machine-file))
"Read the list of build machines from FILE and return it."
(catch #t
(lambda ()
;; Avoid ABI incompatibility with the <build-machine> record.
(set! %fresh-auto-compile #t)
(save-module-excursion
(lambda ()
(set-current-module %user-module)
(primitive-load file))))
(lambda args
(match args
(('system-error . rest)
(let ((err (system-error-errno args)))
;; Silently ignore missing file since this is a common case.
(if (= ENOENT err)
'()
(leave (_ "failed to open machine file '~a': ~a~%")
file (strerror err)))))
(('syntax-error proc message properties form . rest)
(let ((loc (source-properties->location properties)))
(leave (_ "~a: ~a~%")
(location->string loc) message)))
(x
(leave (_ "failed to load machine file '~a': ~s~%")
file args))))))
(define (host-key->type+key host-key)
"Destructure HOST-KEY, an OpenSSH host key string, and return two values:
its key type as a symbol, and the actual base64-encoded string."
(define (type->symbol type)
(and (string-prefix? "ssh-" type)
(string->symbol (string-drop type 4))))
(match (string-tokenize host-key)
((type key _)
(values (type->symbol type) key))
((type key)
(values (type->symbol type) key))))
(define (private-key-from-file* file)
"Like 'private-key-from-file', but raise an error that 'with-error-handling'
can interpret meaningfully."
(catch 'guile-ssh-error
(lambda ()
(private-key-from-file file))
(lambda (key proc str . rest)
(raise (condition
(&message (message (format #f (_ "failed to load SSH \
private key from '~a': ~a")
file str))))))))
(define (open-ssh-session machine)
"Open an SSH session for MACHINE and return it. Throw an error on failure."
(let ((private (private-key-from-file* (build-machine-private-key machine)))
(public (public-key-from-file
(string-append (build-machine-private-key machine)
".pub")))
(session (make-session #:user (build-machine-user machine)
#:host (build-machine-name machine)
#:port (build-machine-port machine)
#:timeout 5 ;seconds
;; #:log-verbosity 'protocol
#:identity (build-machine-private-key machine)
;; We need lightweight compression when
;; exchanging full archives.
#:compression "zlib"
#:compression-level 3)))
(match (connect! session)
('ok
;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about
;; ed25519 keys and 'get-key-type' returns #f in that case.
(let-values (((server) (get-server-public-key session))
((type key) (host-key->type+key
(build-machine-host-key machine))))
(unless (and (or (not (get-key-type server))
(eq? (get-key-type server) type))
(string=? (public-key->string server) key))
;; Key mismatch: something's wrong. XXX: It could be that the server
;; provided its Ed25519 key when we where expecting its RSA key.
(leave (_ "server at '~a' returned host key '~a' of type '~a' \
instead of '~a' of type '~a'~%")
(build-machine-name machine)
(public-key->string server) (get-key-type server)
key type)))
(let ((auth (userauth-public-key! session private)))
(unless (eq? 'success auth)
(disconnect! session)
(leave (_ "SSH public key authentication failed for '~a': ~a~%")
(build-machine-name machine) (get-error session))))
session)
(x
;; Connection failed or timeout expired.
(leave (_ "failed to connect to '~a': ~a~%")
(build-machine-name machine) (get-error session))))))
(define* (connect-to-remote-daemon session
#:optional
(socket-name "/var/guix/daemon-socket/socket"))
"Connect to the remote build daemon listening on SOCKET-NAME over SESSION,
an SSH session. Return a <nix-server> object."
(define redirect
;; Code run in SESSION to redirect the remote process' stdin/stdout to the
;; daemon's socket, à la socat. The SSH protocol supports forwarding to
;; Unix-domain sockets but libssh doesn't have an API for that, hence this
;; hack.
`(begin
(use-modules (ice-9 match) (rnrs io ports))
(let ((sock (socket AF_UNIX SOCK_STREAM 0))
(stdin (current-input-port))
(stdout (current-output-port)))
(setvbuf stdin _IONBF)
(setvbuf stdout _IONBF)
(connect sock AF_UNIX ,socket-name)
(let loop ()
(match (select (list stdin sock) '() (list stdin stdout sock))
((reads writes ())
(when (memq stdin reads)
(match (get-bytevector-some stdin)
((? eof-object?)
(primitive-exit 0))
(bv
(put-bytevector sock bv))))
(when (memq sock reads)
(match (get-bytevector-some sock)
((? eof-object?)
(primitive-exit 0))
(bv
(put-bytevector stdout bv))))
(loop))
(_
(primitive-exit 1)))))))
(let ((channel
(open-remote-pipe* session OPEN_BOTH
;; Sort-of shell-quote REDIRECT.
"guile" "-c"
(object->string
(object->string redirect)))))
(open-connection #:port channel)))
;;;
;;; Synchronization.
;;;
(define (lock-file file)
"Wait and acquire an exclusive lock on FILE. Return an open port."
(mkdir-p (dirname file))
(let ((port (open-file file "w0")))
(fcntl-flock port 'write-lock)
port))
(define (unlock-file lock)
"Unlock LOCK."
(fcntl-flock lock 'unlock)
(close-port lock)
#t)
(define-syntax-rule (with-file-lock file exp ...)
"Wait to acquire a lock on FILE and evaluate EXP in that context."
(let ((port (lock-file file)))
(dynamic-wind
(lambda ()
#t)
(lambda ()
exp ...)
(lambda ()
(unlock-file port)))))
(define-syntax-rule (with-machine-lock machine hint exp ...)
"Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that
context."
(with-file-lock (machine-lock-file machine hint)
exp ...))
(define (machine-slot-file machine slot)
"Return the file name of MACHINE's file for SLOT."
;; For each machine we have a bunch of files representing each build slot.
;; When choosing a build machine, we attempt to get an exclusive lock on one
;; of these; if we fail, that means all the build slots are already taken.
;; Inspired by Nix's build-remote.pl.
(string-append (string-append %state-directory "/offload/"
(build-machine-name machine)
"/" (number->string slot))))
(define (acquire-build-slot machine)
"Attempt to acquire a build slot on MACHINE. Return the port representing
the slot, or #f if none is available.
This mechanism allows us to set a hard limit on the number of simultaneous
connections allowed to MACHINE."
(mkdir-p (dirname (machine-slot-file machine 0)))
(with-machine-lock machine 'slots
(any (lambda (slot)
(let ((port (open-file (machine-slot-file machine slot)
"w0")))
(catch 'flock-error
(lambda ()
(fcntl-flock port 'write-lock #:wait? #f)
;; Got it!
(format (current-error-port)
"process ~a acquired build slot '~a'~%"
(getpid) (port-filename port))
port)
(lambda args
;; PORT is already locked by another process.
(close-port port)
#f))))
(iota (build-machine-parallel-builds machine)))))
(define (release-build-slot slot)
"Release SLOT, a build slot as returned as by 'acquire-build-slot'."
(close-port slot))
;;;
;;; Offloading.
;;;
(define (build-log-port)
"Return the default port where build logs should be sent. The default is
file descriptor 4, which is open by the daemon before running the offload
hook."
(let ((port (fdopen 4 "w0")))
;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
(set-port-revealed! port 1)
port))
(define* (transfer-and-offload drv machine
#:key
(inputs '())
(outputs '())
(max-silent-time 3600)
build-timeout
print-build-trace?)
"Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
MACHINE."
(define session
(open-ssh-session machine))
(define store
(connect-to-remote-daemon session
(build-machine-daemon-socket machine)))
(set-build-options store
#:print-build-trace print-build-trace?
#:max-silent-time max-silent-time
#:timeout build-timeout)
;; Protect DRV from garbage collection.
(add-temp-root store (derivation-file-name drv))
(send-files (cons (derivation-file-name drv) inputs)
store)
(format (current-error-port) "offloading '~a' to '~a'...~%"
(derivation-file-name drv) (build-machine-name machine))
(format (current-error-port) "@ build-remote ~a ~a~%"
(derivation-file-name drv) (build-machine-name machine))
(guard (c ((nix-protocol-error? c)
(format (current-error-port)
(_ "derivation '~a' offloaded to '~a' failed: ~a~%")
(derivation-file-name drv)
(build-machine-name machine)
(nix-protocol-error-message c))
;; Use exit code 100 for a permanent build failure. The daemon
;; interprets other non-zero codes as transient build failures.
(primitive-exit 100)))
(build-derivations store (list drv)))
(retrieve-files outputs store)
(format (current-error-port) "done with offloaded '~a'~%"
(derivation-file-name drv)))
(define (store-import-channel session)
"Return an output port to which archives to be exported to SESSION's store
can be written."
;; Using the 'import-paths' RPC on a remote store would be slow because it
;; makes a round trip every time 32 KiB have been transferred. This
;; procedure instead opens a separate channel to use the remote
;; 'import-paths' procedure, which consumes all the data in a single round
;; trip.
(define import
`(begin
(use-modules (guix))
(with-store store
(setvbuf (current-input-port) _IONBF)
(import-paths store (current-input-port)))))
(open-remote-output-pipe session
(string-join
`("guile" "-c"
,(object->string
(object->string import))))))
(define (store-export-channel session files)
"Return an input port from which an export of FILES from SESSION's store can
be read."
;; Same as above: this is more efficient than calling 'export-paths' on a
;; remote store.
(define export
`(begin
(use-modules (guix))
(with-store store
(setvbuf (current-output-port) _IONBF)
(export-paths store ',files (current-output-port)))))
(open-remote-input-pipe session
(string-join
`("guile" "-c"
,(object->string
(object->string export))))))
(define (send-files files remote)
"Send the subset of FILES that's missing to REMOTE, a remote store."
(with-store store
;; Compute the subset of FILES missing on SESSION and send them.
(let* ((session (channel-get-session (nix-server-socket remote)))
(node (make-node session))
(missing (node-eval node
`(begin
(use-modules (guix)
(srfi srfi-1) (srfi srfi-26))
(with-store store
(remove (cut valid-path? store <>)
',files)))))
(port (store-import-channel session)))
(format #t (_ "sending ~a store files to '~a'...~%")
(length missing) (session-get session 'host))
;; Send MISSING in topological order.
(export-paths store missing port)
;; Tell the remote process that we're done. (In theory the
;; end-of-archive mark of 'export-paths' would be enough, but in
;; practice it's not.)
(channel-send-eof port)
;; Wait for completion of the remote process.
(let ((result (zero? (channel-get-exit-status port))))
(close-port port)
result))))
(define (retrieve-files files remote)
"Retrieve FILES from SESSION's store, and import them."
(let* ((session (channel-get-session (nix-server-socket remote)))
(host (session-get session 'host))
(port (store-export-channel session files)))
(format #t (_ "retrieving ~a files from '~a'...~%")
(length files) host)
;; We cannot use the 'import-paths' RPC here because we already
;; hold the locks for FILES.
(let ((result (restore-file-set port
#:log-port (current-error-port)
#:lock? #f)))
(close-port port)
result)))
;;;
;;; Scheduling.
;;;
(define (machine-matches? machine requirements)
"Return #t if MACHINE matches REQUIREMENTS."
(and (string=? (build-requirements-system requirements)
(build-machine-system machine))
(lset<= string=?
(build-requirements-features requirements)
(build-machine-features machine))))
(define (machine-load machine)
"Return the load of MACHINE, divided by the number of parallel builds
allowed on MACHINE."
;; Note: This procedure is costly since it creates a new SSH session.
(let* ((session (open-ssh-session machine))
(pipe (open-remote-pipe* session OPEN_READ
"cat" "/proc/loadavg"))
(line (read-line pipe)))
(close-port pipe)
(if (eof-object? line)
+inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
(match (string-tokenize line)
((one five fifteen . _)
(let* ((raw (string->number five))
(jobs (build-machine-parallel-builds machine))
(normalized (/ raw jobs)))
(format (current-error-port) "load on machine '~a' is ~s\
(normalized: ~s)~%"
(build-machine-name machine) raw normalized)
normalized))
(_
+inf.0))))) ;something's fishy about MACHINE, so avoid it
(define (machine-lock-file machine hint)
"Return the name of MACHINE's lock file for HINT."
(string-append %state-directory "/offload/"
(build-machine-name machine)
"." (symbol->string hint) ".lock"))
(define (machine-choice-lock-file)
"Return the name of the file used as a lock when choosing a build machine."
(string-append %state-directory "/offload/machine-choice.lock"))
(define %slots
;; List of acquired build slots (open ports).
'())
(define (choose-build-machine machines)
"Return the best machine among MACHINES, or #f."
;; Proceed like this:
;; 1. Acquire the global machine-choice lock.
;; 2. For all MACHINES, attempt to acquire a build slot, and filter out
;; those machines for which we failed.
;; 3. Choose the best machine among those that are left.
;; 4. Release the previously-acquired build slots of the other machines.
;; 5. Release the global machine-choice lock.
(with-file-lock (machine-choice-lock-file)
(define machines+slots+loads
(filter-map (lambda (machine)
;; Call 'machine-load' from here to make sure it is called
;; only once per machine (it is expensive).
(let ((slot (acquire-build-slot machine)))
(and slot
(list machine slot (machine-load machine)))))
machines))
(define (undecorate pred)
(lambda (a b)
(match a
((machine1 slot1 load1)
(match b
((machine2 slot2 load2)
(pred machine1 load1 machine2 load2)))))))
(define (machine-less-loaded-or-faster? m1 l1 m2 l2)
;; Return #t if M1 is either less loaded or faster than M2, with L1
;; being the load of M1 and L2 the load of M2. (This relation defines a
;; total order on machines.)
(> (/ (build-machine-speed m1) (+ 1 l1))
(/ (build-machine-speed m2) (+ 1 l2))))
(let loop ((machines+slots+loads
(sort machines+slots+loads
(undecorate machine-less-loaded-or-faster?))))
(match machines+slots+loads
(((best slot load) others ...)
;; Return the best machine unless it's already overloaded.
(if (< load 2.)
(match others
(((machines slots loads) ...)
;; Release slots from the uninteresting machines.
(for-each release-build-slot slots)
;; Prevent SLOT from being GC'd.
(set! %slots (cons slot %slots))
best))
(begin
;; BEST is overloaded, so try the next one.
(release-build-slot slot)
(loop others))))
(() #f)))))
(define* (process-request wants-local? system drv features
#:key
print-build-trace? (max-silent-time 3600)
build-timeout)
"Process a request to build DRV."
(let* ((local? (and wants-local? (string=? system (%current-system))))
(reqs (build-requirements
(system system)
(features features)))
(candidates (filter (cut machine-matches? <> reqs)
(build-machines))))
(match candidates
(()
;; We'll never be able to match REQS.
(display "# decline\n"))
((_ ...)
(let ((machine (choose-build-machine candidates)))
(if machine
(begin
;; Offload DRV to MACHINE.
(display "# accept\n")
(let ((inputs (string-tokenize (read-line)))
(outputs (string-tokenize (read-line))))
(transfer-and-offload drv machine
#:inputs inputs
#:outputs outputs
#:max-silent-time max-silent-time
#:build-timeout build-timeout
#:print-build-trace? print-build-trace?)))
;; Not now, all the machines are busy.
(display "# postpone\n")))))))
;;;
;;; Entry point.
;;;
(define (guix-offload . args)
(define request-line-rx
;; The request format. See 'tryBuildHook' method in build.cc.
(make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
(define not-coma
(char-set-complement (char-set #\,)))
;; Make sure $HOME really corresponds to the current user. This is
;; necessary since lsh uses that to determine the location of the yarrow
;; seed file, and fails if it's owned by someone else.
(and=> (passwd:dir (getpw (getuid)))
(cut setenv "HOME" <>))
;; We rely on protocol-level compression from libssh to optimize large data
;; transfers. Warn if it's missing.
(unless (zlib-support?)
(warning (_ "Guile-SSH lacks zlib support"))
(warning (_ "data transfers will *not* be compressed!")))
(match args
((system max-silent-time print-build-trace? build-timeout)
(let ((max-silent-time (string->number max-silent-time))
(build-timeout (string->number build-timeout))
(print-build-trace? (string=? print-build-trace? "1")))
(parameterize ((%current-system system))
(let loop ((line (read-line)))
(unless (eof-object? line)
(cond ((regexp-exec request-line-rx line)
=>
(lambda (match)
(with-error-handling
(process-request (equal? (match:substring match 1) "1")
(match:substring match 2) ; system
(call-with-input-file
(match:substring match 3)
read-derivation)
(string-tokenize
(match:substring match 4) not-coma)
#:print-build-trace? print-build-trace?
#:max-silent-time max-silent-time
#:build-timeout build-timeout))))
(else
(leave (_ "invalid request line: ~s~%") line)))
(loop (read-line)))))))
(("--version")
(show-version-and-exit "guix offload"))
(("--help")
(format #t (_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
Process build offload requests written on the standard input, possibly
offloading builds to the machines listed in '~a'.~%")
%machine-file)
(display (_ "
This tool is meant to be used internally by 'guix-daemon'.\n"))
(show-bug-report-information))
(x
(leave (_ "invalid arguments: ~{~s ~}~%") x))))
;;; Local Variables:
;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
;;; End:
;;; offload.scm ends here