crossfire/crossfire/server.rkt

#lang racket/base
;; crossfire: distributed brute force infrastructure
;;
;; Copyright (C) 2020 haskal
;;
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU Affero General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU Affero General Public License for more details.
;;
;; You should have received a copy of the GNU Affero General Public License
;; along with this program.  If not, see <https://www.gnu.org/licenses/>.

(require db/base db/sqlite3
         data/queue racket/async-channel racket/bool racket/contract racket/fasl racket/file
         racket/function racket/list racket/logging racket/match racket/path racket/random
         racket/runtime-path racket/set racket/string racket/unit
         north/base north/adapter/base north/adapter/sqlite
         "comms.rkt" "info.rkt" "logging.rkt" "manifest.rkt" "not-crypto.rkt" "pattern.rkt"
         "protocol.rkt"
         (submod "static-support.rkt" misc-calls))

;; logging
(define-logger server #:parent global-logger)

;; configuration

(define *server-node-id* 0)

(define *production?* #f)

;; XXX : platform-specific
(define *config-root* (if *production?* "/etc/" "etc/"))
(define *state-root* (if *production?* "/var/lib/crossfire/" "lib/"))
(define *lib-root* (if *production?* "/usr/lib/" "lib/"))

(define (get-binary-path-for-arch arch)
  (build-path *lib-root* (string-append "arch_" arch) "crossfire-agent"))

(define *server-config-path* (build-path *config-root* "crossfire.rktd"))
(define *server-db-path* (build-path *state-root* "crossfire.sqlite"))
(define *server-files-path* (build-path *state-root* "projects/"))
(define *server-seckey-path* (build-path *state-root* "server.key"))

(define (get-project-file-path projid)
  (build-path *server-files-path* (number->string projid)))


;; comms node for server (without secret key)
(define current-server-public-node (make-parameter #f))

;; north migrations
(define-runtime-path migrations-dir "migrations/")

;; database

(define current-db (make-parameter #f))

(define (open-server-db [mode 'read/write])
  (let ([db (sqlite3-connect #:database *server-db-path* #:mode mode)])
    (query-exec db "pragma foreign_keys=1;")
    db))

;; this allows the server to be capable of migrating itself
(define (migrate-server-db [db (current-db)])
  ;; these are the steps taken by the north cli tool (it's a bit verbose but at least it's all
  ;; programmatically accessible...)
  ;; i also use a runtime path to be a bit more robust
  (define base (path->migration migrations-dir))
  (define adapter (sqlite-adapter db))
  (adapter-init adapter)
  (define current-revision (adapter-current-revision adapter))
  (define target-revision (migration-revision (migration-most-recent base)))
  (define plan (migration-plan base current-revision target-revision))
  ;; TODO : automatically back up old db if there are migrations to apply
  ;; or idk just ask the user to make backups tbh
  (call-with-transaction db (lambda ()
    (for ([migration (in-list plan)])
      (log-server-info "applying migration: ~a" (migration-revision migration))
      (adapter-apply! adapter (migration-revision migration) (migration-up migration)))))
  (void))

(define-syntax-rule (define-stmt name what)
  (define name (virtual-statement what)))

(define-stmt q-new-node "insert into node (name, arch, type, secret) values (?, ?, ?, ?)")
(define-stmt q-add-node-res "insert or ignore into node_resource (nodeid, resource) values (?, ?)")
(define-stmt q-del-node-res "delete from node_resource where nodeid=? and resource=?")
(define-stmt q-get-nodes "select id, name, arch from node where type=?")
(define-stmt q-get-all-resources
  "select nodeid, resource from node_resource inner join node on node.id = node_resource.nodeid
       where node.type = ?")
(define-stmt q-get-node-resources "select resource from node_resource where nodeid=?")
(define-stmt q-edit-node "update node set name=? where id=?")
(define-stmt q-delete-node "delete from node where id=?")
(define-stmt q-delete-node-type "delete from node where type=?")
(define-stmt q-get-node-type "select type from node where id=?")
(define-stmt q-get-node-info "select name, arch, type, secret from node where id=?")

(define-stmt q-new-task "insert into task (name, manifest) values (?, ?)")
(define-stmt q-get-task-id-commit "select id, committed from task")
(define-stmt q-set-task-commit "update task set committed=1 where id=?")
(define-stmt q-delete-task "delete from task where id=?")
(define-stmt q-get-tasks "select id, name, manifest, complete from task")
(define-stmt q-set-task-complete "update task set complete=1 where id=?")
(define-stmt q-task-exists "select 1 from task where id=?")

(define-stmt q-get-task-log
  "select worker, time_wall_start, duration, pattern from task_log where taskid=?")
(define-stmt q-add-task-log
  "insert into task_log (taskid, worker, time_wall_start, duration, pattern)
       values (?, ?, ?, ?, ?)")
(define-stmt q-add-task-match
  "insert into task_match (taskid, worker, time_wall, match) values (?, ?, ?, ?)")
(define-stmt q-get-task-matches
  "select worker, time_wall, match from task_match where taskid=?")
(define-stmt q-count-task-match
  "select count(*) from task_match where taskid=?")

;; utils

(define (query/insert-id db stmt . args)
  (define info (simple-result-info (apply query db stmt args)))
  (cdr (assoc 'insert-id info)))

;; cleanly recovers from potential crash situations
(define (server-cleanup-unused-files)
  (define existing-ids (mutable-set))
  (call-with-transaction (current-db) (lambda ()
    (define (cleanup id exists? path)
      (log-server-warning "removing corrupted/incomplete task ~a" id)
      (when exists? (delete-file path))
      (query-exec (current-db) q-delete-task id))
    (for ([(id committed) (in-query (current-db) q-get-task-id-commit)])
      (set-add! existing-ids (number->string id))
      (define path (get-project-file-path id))
      (define exists? (file-exists? path))
      (define committed? (= 1 committed))
      (cond
        ;; potentially crashed while file was fsync'd but the directory was not
        [(and committed? (not exists?)) (cleanup id exists? path)]
        ;; crashed between row insert and file fsync
        [(not committed?) (cleanup id exists? path)]
        [else (void)]))))
  ;; delete any unaffiliated files
  (for ([subpath (in-directory *server-files-path*)])
    (define name (path->string (file-name-from-path subpath)))
    (unless (set-member? existing-ids name)
      (delete-file subpath)))
  (void))

;; commits a file corresponding to the task
(define (server-commit-file taskid data)
  (define path (get-project-file-path taskid))
  (call-with-output-file path
    (lambda (out)
      (write-bytes data out)
      (port-fsync out))
    #:mode 'binary
    #:exists 'truncate)
  (query-exec (current-db) q-set-task-commit taskid))

;; computes a hash of the file identifying its current contents for agents
;; (in case we reuse taskids)
(define (server-hash-file taskid)
  (file->blake2b-hash (get-project-file-path taskid)))

(define (server-get-file taskid)
  (define path (get-project-file-path taskid))
  (file->bytes path))


;; rpc helpers

(define (get-nodes type)
  (define type-str (symbol->string type))
  (define resources (rows->dict #:key "nodeid" #:value "resource" #:value-mode '(list)
                                (query (current-db) q-get-all-resources type-str)))
  (for/list ([(id name arch) (in-query (current-db) q-get-nodes type-str)])
    (define online? (and (current-comms) (comms-channel-available? (current-comms) id)))
    (node-info id name arch type (hash-ref resources id '()) online?)))

(define (make-node name arch type resources)
  (call-with-transaction (current-db) (lambda ()
    (define secret (crypto-sign-make-key))
    (define public (crypto-sign-public-key secret))
    (define id (query/insert-id (current-db) q-new-node name arch (symbol->string type) secret))
    (for ([res (in-list resources)])
      (query-exec (current-db) q-add-node-res id res))
    (values id public))))

(define (load-comms-node id [with-secret? #f] [with-arch? #f])
  (match (query-maybe-row (current-db) q-get-node-info id)
    [(vector name arch type secret)
     (define node-val
       (node id name (string->symbol type) (crypto-sign-public-key secret)
             (and with-secret? secret) #f #f))
     (if with-arch?
       (values node-val arch)
       node-val)]
    [_ (error "invalid node id" id)]))

(define (restore-comms-node id)
  (comms-set-node-info (current-comms) (load-comms-node id)))

(define (configure-agent-binary agent-node agent-arch server-node)
  (define binary
    (file->bytes
      (get-binary-path-for-arch agent-arch)))

  ;; XXX : platform-specific behavior
  (define (configure.linux-gnu)
    (define trailing-data (s-exp->fasl (list agent-node server-node)))
    ;; write 32 bit unsigned big endian trailer size (including size)
    (define trailing-size
      (integer->integer-bytes (+ 4 (bytes-length trailing-data)) 4 #f #t))
    (bytes-append binary trailing-data trailing-size))

  (match (string-split agent-arch "-")
    [(list _ ... "linux" "gnu") (configure.linux-gnu)]
    [_ (error "XXX: don't know how to configure arch" agent-arch)]))

;; manifest is the raw form
(define (make-task manifest tar)
  (define manifest-data (s-exp->fasl manifest))
  (define name (second (assoc 'name manifest)))
  (define id (query/insert-id (current-db) q-new-task name manifest-data))
  (server-commit-file id tar)
  id)

(define (enforce-subject type)
  ;; override if the from-node is us
  (unless (or (symbol=? type (node-type (current-from-node)))
              (= (node-id (current-from-node)) *server-node-id*))
    (error "unauthorized")))

(define (enforce-object id type)
  (match (query-maybe-value (current-db) q-get-node-type id)
    [#f (error "node doesn't exist" id)]
    [(== (symbol->string type)) (void)]
    [x (error "wrong node type" x)]))


;; client rpcs

(define (get-agents)
  (enforce-subject 'client)
  (get-nodes 'agent))

(define/contract (new-agent name arch resources)
  (-> string? string? (listof string?) integer?)
  (enforce-subject 'client)
  (define-values [id public] (make-node name arch 'agent resources))
  (define comms-node (node id name 'agent public #f #f #f))
  (comms-set-node-info (current-comms) comms-node)
  (agent-handler-new-agent id arch resources)
  id)

(define/contract (edit-agent id name resources)
  (-> integer? string? (listof string?) void?)
  (enforce-subject 'client)
  (call-with-transaction (current-db) (lambda ()
    (enforce-object id 'agent)
    (define existing-resource (list->set (query-list (current-db) q-get-node-resources id)))
    (define new-resource (list->set resources))
    (query-exec (current-db) q-edit-node name id)
    (for ([res (in-set (set-subtract new-resource existing-resource))])
      (query-exec (current-db) q-add-node-res id res))
    (for ([res (in-set (set-subtract existing-resource new-resource))])
      (query-exec (current-db) q-del-node-res id res))))
  (define-values [comms-node arch] (load-comms-node id #f #t))
  (comms-set-node-info (current-comms) comms-node)
  (agent-handler-delete-agent id)
  (agent-handler-new-agent id arch resources)
  (void))

(define/contract (get-agent-deployment id)
  (-> integer? bytes?)
  ;; TODO : streaming interface
  (enforce-subject 'client)
  (define-values [agent-node arch] (load-comms-node id #t #t))
  (match (node-type agent-node)
    ['agent (configure-agent-binary agent-node arch (current-server-public-node))]
    [_ (error "invalid node type")]))

(define/contract (delete-agent id)
  (-> integer? void?)
  (enforce-subject 'client)
  (call-with-transaction (current-db) (lambda ()
    (enforce-object id 'agent)
    (with-handlers
      ([exn:fail:sql?
         (lambda (_)
           (error "failed to delete agent, perhaps there are some projects referencing it?"))])
      (query-exec (current-db) q-delete-node id))))
  (comms-delete-node (current-comms) id)
  (agent-handler-delete-agent id)
  (void))

;; client rpcs :: projects

(define/contract (new-project manifest tar)
  (-> list? bytes? integer?)
  ;; TODO : streaming interface
  (enforce-subject 'client)
  ;; check validity
  (define mf-parsed (parse-manifest manifest))
  (define id (make-task manifest tar))
  ;; notify agent handler
  (agent-handler-new-task id mf-parsed)
  id)

(define (get-projects)
  (enforce-subject 'client)

  (define (get-progress id mf)
    (define parsed-mf (parse-manifest mf))
    (define total-size (manifest-psize parsed-mf))
    (define completed-size
      (for/sum ([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)])
        (define sub (make-integer-set (fasl->s-exp pat-fasl)))
        (integer-set-count sub)))
    (/ completed-size total-size))

  (for/list ([(id name manifest complete?) (in-query (current-db) q-get-tasks)])
    (define mf (fasl->s-exp manifest))
    ;; XXX this should be a join t b h
    (define matches (query-value (current-db) q-count-task-match id))
    (define progress
      (if (zero? complete?)
        ;; XXX this could also _maybe_ be a join but idk
        (get-progress id mf)
        1.0))
    (project-info id name mf progress matches)))

(define/contract (get-project-file taskid)
  (-> integer? bytes?)
  ;; TODO : streaming interface
  (with-handlers ([exn:fail? (lambda (ex) (error "unable to fetch the requested file"))])
    (server-get-file taskid)))

(define/contract (get-project-matches taskid)
  (-> integer? (listof (listof integer?)))
  (for/list ([(worker time-wall match-fasl) (in-query (current-db) q-get-task-matches taskid)])
    (fasl->s-exp match-fasl)))

(define/contract (delete-project taskid)
  (-> integer? void?)
  (enforce-subject 'client)
  (call-with-transaction (current-db) (lambda ()
    (define exist (query-maybe-value (current-db) q-task-exists taskid))
    (when exist
      (agent-handler-cancel-task taskid)
      (query-exec (current-db) q-delete-task taskid))))
  (define path (get-project-file-path taskid))
  (when (file-exists? path)
    (delete-file path)))

;; agent handling

;; minimum batch of inputs that will be scheduled for a node
;; the batch size scales to how fast the node completes a batch
(define *min-subtask-size* 16)
;; aim to batch every 5 minutes
;; if a task takes shorter, the amount will be doubled, and if it takes longer it will be halved
(define *optimal-completion-secs* 300)
;; tasks will be reassigned if not completed within this time
(define *subtask-timeout* (* 3 *optimal-completion-secs*))

;; constatns for agent rpc retries
(define *min-retry-delay* 2)
(define *max-retry-delay* 120)
(define *retry-delay-ratio* 2)

(define (agent-handler)
  ;; unlike comms, messages to agent-handler have no responses. just thread-send, it's one-way
  (define cust (make-custodian))
  (define this-thread (current-thread))
  (current-agent-handler this-thread)
  ;; make an auto cleanup thread :P
  (thread (lambda () (thread-wait this-thread) (custodian-shutdown-all cust)))
  ;; setup agent rpcs
  (define agent-wrapper@ (make-rpc-wrapper-unit agent^))
  (define-values/invoke-unit agent-wrapper@ (import) (export agent^))

  ;; tasks
  ;; semaphore guarding mutation (sigh)
  ;; manifest: task manifest (parsed version)
  ;; work-pattern: the integer-set representing the work left to do
  ;; agent-todo: hash of agent id to integer-set representing work the agent is working on
  ;; file-hash: the hash to send to agents to identify the contents of the project file more
  ;; precisely than just the taskid, which allows them to cache the file locally
  ;; completed-work: an integer set of completed work
  (struct task-state [id sema manifest [work-pattern #:mutable] agent-todo file-hash
                      [completed-work #:mutable]] #:transparent)

  (define (initialize-task id mf)
    (define file-hash (server-hash-file id))
    (define sema (make-semaphore 1))
    (define agent-todo (make-hash))
    (define init-pattern-range (range->integer-set 0 (sub1 (manifest-psize mf))))
    ;; subtract the pattern ranges that were already logged as complete
    (define-values [pattern-range completed-work]
      (for/fold ([pattern-range init-pattern-range]
                 [completed-work (make-integer-set '())])
                ([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)])
        (define sub (make-integer-set (fasl->s-exp pat-fasl)))
        (values (integer-set-subtract pattern-range sub) (integer-set-union completed-work sub))))
    (task-state id sema mf pattern-range agent-todo file-hash completed-work))

  (define (task-has-work? ts)
    (not (and (zero? (integer-set-count (task-state-work-pattern ts)))
              (hash-empty? (task-state-agent-todo ts)))))

  (define (task-set-complete! ts)
    (log-server-info "fully completed task: ~a" (task-state-id ts))
    (query-exec (current-db) q-set-task-complete (task-state-id ts))
    ;; TODO : notification mechanism
    (handle-stop-task (task-state-id ts)))

  ;; this doesn't update the database - that only gets updated when the work is complete
  (define (task-assign! ts agent-id requested-amount)
    (unless (positive? requested-amount) (error "requested amount must be positive"))
    (call-with-semaphore (task-state-sema ts) (lambda ()
      (when (hash-has-key? (task-state-agent-todo ts) agent-id)
        (error "agent already has work assigned"))
      (define-values [assignment new-wp]
        (pattern-range-take (task-state-work-pattern ts) requested-amount))
      (cond
        ;; done! (maybe)
        [(zero? (integer-set-count assignment))
         (define at (task-state-agent-todo ts))
         (cond
           [(hash-empty? at) ;; actually done. cancel all in-progress assignments and celebrate uwu
            ;; are we going to hold up literally everything because we're still holding this
            ;;semaphore during a database write?
            ;; probably
            ;; does it actually matter?
            ;; probably not
            (task-set-complete! ts)
            #f]
           [else ;; steal work lol
            ;; this will massively overcommit the last few parts of a project and potentially
            ;; prioritize doing useless duplicate work instead of moving on to the next project
            ;; but it'll be fiiiiiine don't worry
            (define wp (for/fold ([iset (make-integer-set '())]) ([(_ v) (in-hash at)])
              (integer-set-union iset v)))
            (define-values [assignment _]
              (pattern-range-take (task-state-work-pattern ts) requested-amount))
            (hash-set! at agent-id assignment)
            assignment])]
        ;; update tracking
        [else
          (hash-set! (task-state-agent-todo ts) agent-id assignment)
          (set-task-state-work-pattern! ts new-wp)
          assignment]))))

  ;; returns work from agent back to the regular work pool
  (define (task-unassign! ts agent-id)
    (call-with-semaphore (task-state-sema ts) (lambda ()
      (match (hash-ref (task-state-agent-todo ts) agent-id #f)
        [#f (void)]
        [assignment
         (define new-wp (integer-set-union assignment (task-state-work-pattern ts)))
         (set-task-state-work-pattern!
           ts (integer-set-subtract new-wp (task-state-completed-work ts)))
         (hash-remove! (task-state-agent-todo ts) agent-id)]))))

  ;; adds to task log, then updates work pool with task completion
  (define (task-complete! ts agent-id time-wall-start duration)
    (call-with-semaphore (task-state-sema ts) (lambda ()
      (match (hash-ref (task-state-agent-todo ts) agent-id #f)
        [#f (void)]
        [assignment
         ;; add a new work log
         (query-exec
           (current-db) q-add-task-log (task-state-id ts) agent-id time-wall-start duration
           (s-exp->fasl (integer-set-contents assignment)))
         (define new-completed (integer-set-union (task-state-completed-work ts) assignment))
         (set-task-state-completed-work! ts new-completed)
         ;; remove tracking - this work is now done
         (hash-remove! (task-state-agent-todo ts) agent-id)
         ;; check if we're fully complete. if so, mark the task complete in the database and cancel
         ;; all related assignments
         (unless (task-has-work? ts)
           (task-set-complete! ts))]))))

  (define (agent-thd id arch resources-in msg-chan)
    ;; initialize to-node for rpcs
    (current-to-node (comms-get-node-info (current-comms) id))
    ;; helper to generate assignment ids (a kind of arbitrary number we pass to agents to track
    ;; what they're currently working on)
    (define make-assignment-id
      (let ([counter (random 0 1000)])
        (lambda () (begin0 counter (set! counter (add1 counter))))))
    ;; set of resources
    (define resources (list->set resources-in))
    (set! resources-in #f)

    ;; tracks the sizes of task pieces given to the agent
    ;; task id to count of assignment size
    (define task-size (make-hash))
    ;; current tasks, mirrors parent but only contains tasks we have resources for
    (define current-tasks (make-hash))

    ;; assignments
    (struct assignment [id taskid start-time-utc start-time-monotonic] #:transparent)
    ;; active assignments, by assignment id
    (define assigned-tasks (make-hash))

    ;; keep running?
    (define run-agent-thd? #t)

    (define (handle-cleanup)
      ;; unassign all tasks
      (for ([(aid av) (in-hash assigned-tasks)])
        (define taskid (assignment-taskid av))
        (define ts (hash-ref current-tasks taskid #f))
        (unless (false? ts)
          (task-unassign! ts id)))

      ;; call agent rpc to cancel everything
      (cancel-all-assignments))

    ;; helper to repeatedly invoke an agent rpc
    (define (invoke/retry-forever proc)
      (let init-loop ([retry-delay *min-retry-delay*])
        (define maybe-exn
          (with-handlers ([exn:fail? identity])
            (proc)
            #f))
        (when maybe-exn
          (log-server-error "agent ~a encountered error" id)
          ((error-display-handler) (exn-message maybe-exn) maybe-exn)
          (sleep retry-delay)
          (init-loop (min *max-retry-delay*
                          (* *retry-delay-ratio* retry-delay))))))

    ;; #t if a new assignment was added, otherwise #f
    (define (create-assignment! ts)
      (define requested-amount (hash-ref! task-size (task-state-id ts) *min-subtask-size*))
      ;; integer set of assignment data, or false
      (define assign-data (task-assign! ts id requested-amount))
      (cond
        [(false? assign-data) #f]
        [else
         (define start-time-utc (current-seconds))
         (define start-time-monotonic (current-seconds-monotonic))
         (define aid (make-assignment-id))
         (define mf-parsed (task-state-manifest ts))
         (define file-hash (task-state-file-hash ts))
         (define mf-raw (serialize-manifest mf-parsed))
         ;; add to local tracking
         (hash-set! assigned-tasks aid (assignment aid (task-state-id ts) start-time-utc
                                                   start-time-monotonic))
         ;; send agent rpc
         (invoke/retry-forever
           (lambda () (push-assignment aid (task-state-id ts) mf-raw file-hash
                                       (integer-set-contents assign-data))))
         #t]))

    (define (cancel-assignment! assignment)
      ;; tell the agent to cancel work, unassign the assignment
      (define ts (hash-ref current-tasks (assignment-taskid assignment) #f))
      ;; update manager tracking
      (unless (false? ts)
        (task-unassign! ts id))
      (hash-remove! assigned-tasks (assignment-id assignment))
      ;; send agent rpc
      (invoke/retry-forever (lambda () (cancel-assignment (assignment-id assignment)))))

    ;; records a match for a certain assignment
    (define (add-assignment-match! assignment success-input)
      ;; TODO : notify other things that a match occurred maybe?
      ;; set complete!!!
      (log-server-info "agent handler ~a: match for task ~a: ~a" id (assignment-taskid assignment)
                       success-input)
      (query-exec
        (current-db) q-add-task-match
        (assignment-taskid assignment) id (current-seconds) (s-exp->fasl success-input)))

    (define (complete-assignment! assignment)
      (log-server-info "agent handler ~a: completed assignment for task ~a" id
                       (assignment-taskid assignment))
      (define end-time-monotonic (current-seconds-monotonic))
      (define duration (- end-time-monotonic (assignment-start-time-monotonic assignment)))
      (define ts (hash-ref current-tasks (assignment-taskid assignment) #f))
      (hash-remove! assigned-tasks (assignment-id assignment))
      (unless (false? ts)
        ;; mark complete
        (task-complete! ts id (assignment-start-time-utc assignment) duration)
        ;; update next task size request
        (cond
          [(< duration *optimal-completion-secs*)
           (log-server-info "agent handler ~a: increasing task allocation" id)
           (hash-update! task-size (assignment-taskid assignment)
                         (lambda (v) (* 2 v)) *min-subtask-size*)]
          [(> duration (* 2 *optimal-completion-secs*))
           (log-server-info "agent handler ~a: decreasing task allocation" id)
           (hash-update! task-size (assignment-taskid assignment)
                         (lambda (v) (max *min-subtask-size* (/ 2 v))) *min-subtask-size*)]
          [else (void)])))

    ;; examine the current tasks and immediately assign anything that has work to do and that we
    ;; have resources for. this is idempotent, it can be called whenever there is any sort of state
    ;; change regarding assignments and it will take appropriate action
    ;; this is also the main point to introduce a potentially smarter scheduling algorithm. the
    ;; algorithm is currently "yolo it because we don't really expect many competing tasks at
    ;; the same time anyway" -- this means performance with nontrivial sets of resources and
    ;; a large number of concurrent tasks could be suboptimal
    (define (update-assignments!)
      (log-server-info "agent handler ~a: updating assignments" id)
      ;; detect set of used resources
      ;; see set of current inactive tasks
      ;; if a task uses all free resources, assign it
      (define used-resources
        (for/fold ([used-resources (set)]) ([(aid av) (in-hash assigned-tasks)])
          (define task (hash-ref current-tasks (assignment-taskid av)))
          (define manifest (task-state-manifest task))
          (set-union used-resources (list->set (manifest-data-ref manifest 'resources '())))))

      (define available-resources (set-subtract resources used-resources))

      (define assigned-taskids
        (for/set ([(aid av) (in-hash assigned-tasks)]) (assignment-taskid av)))

      ;; the scheduling prioritizer is currently:
      ;;   yolo just pick randomly
      ;; this should be good enough for all basic use cases
      (define task-list
        (shuffle
          (for/list ([(tid ts) (in-hash current-tasks)]
                #:unless (set-member? assigned-taskids tid))
            ts)))

      ;; assigns the first task in the list that we are capable of doing
      ;; returns if we could maybe assign more
      (define (select-task! task-list)
        (match task-list
          ['() #f]
          [(cons head tail)
           (define manifest (task-state-manifest head))
           (define needed-resources (list->set (manifest-data-ref manifest 'resources '())))
           (define needed-arch (manifest-data-ref manifest 'arch '("any")))
           (define right-arch? (or (member "any" needed-arch) (member arch needed-arch)))
           (if (and right-arch?
                    (task-has-work? head)
                    (subset? needed-resources available-resources))
             (create-assignment! head)
             (select-task! tail))]))

      (when (select-task! task-list)
        (update-assignments!)))

    (define (handle-command data)
      (match data
        [(cons 'new-task ts)
         (log-server-info "agent handler ~a got new task ~a" id (task-state-id ts))
         (hash-set! current-tasks (task-state-id ts) ts)
         (update-assignments!)]
        [(cons 'cancel-task tid)
         (log-server-info "agent handler ~a deregistering task ~a" id tid)
         (define assignments
           (for/list ([(_ v) (in-hash assigned-tasks)]
                      #:when (= tid (assignment-taskid v)))
             v))
         (for ([av (in-list assignments)])
           (cancel-assignment! av))
         (hash-remove! current-tasks tid)
         (hash-remove! task-size tid)
         (update-assignments!)]
        ;; got completion report from agent
        [(cons 'agent-report status)
         (match status
           ;; no current assignment
           [(cons #f _) (void)] ;; do nothing
           ;; current assignment incomplete
           [(cons assignment-id 'incomplete) (void)] ;; also do nothing
           ;; current assignment complete
           [(cons assignment-id 'complete)
            (define av (hash-ref assigned-tasks assignment-id #f))
            (unless (false? av) (complete-assignment! av))
            (update-assignments!)]
           ;; execution failed, unassign task
           ;; TODO : maybe blacklist this task from this agent if there are too many errors
           ;; TODO : notify connected clients that an error occurred
           [(cons assignment-id 'error)
            (define av (hash-ref assigned-tasks assignment-id #f))
            (unless (false? av) (cancel-assignment! av))
            (update-assignments!)]
           ;; got succeeding input
           [(cons assignment-id success-input)
            (define av (hash-ref assigned-tasks assignment-id #f))
            (unless (false? av) (add-assignment-match! av success-input))])]
        ['shutdown (set! run-agent-thd? #f)]))

    (define (handle-assignment-timeout)
      ;; on timeout, work is returned to the assignment pool and other agents may not be actually
      ;; notified of this. but because of work stealing they should have already attempting to
      ;; steal the work so there shouldn't actually be a situation where an agent thread is asleep
      ;; when work is returned to the pool
      (define time (current-seconds-monotonic))
      (define overdue-assignments
        (filter (lambda (av)
                  (define st (assignment-start-time-monotonic av))
                  (>= (- time st) *subtask-timeout*))
                (hash-values assigned-tasks)))
      (for ([overdue (in-list overdue-assignments)])
        (define taskid (assignment-taskid overdue))
        ;; revert to the smallest possible task size, just in case
        ;; we don't want to spin forever assigning potentially gigantic tasks and timing out
        ;; although ideally this case wouldn't occur because the timeout is three times the target
        ;; subtask duration
        (hash-set! task-size taskid *min-subtask-size*)
        (log-server-warning "agent ~a timed out on task ~a" id taskid)
        (cancel-assignment! overdue)))

    (with-handlers ([exn? (lambda (ex) (handle-cleanup) (raise ex))])
      ;; wait for agent node to become present
      ;; TODO : comms should have some sort notification mechanism for this

      (log-server-info "agent handler ~a waiting for agent" id)
      (let loop ()
        (unless (comms-channel-available? (current-comms) id)
          (sleep 10)
          (loop)))
      (log-server-info "agent handler ~a: agent online" id)

      ;; cancel whatever the agent is currently working on, in case the server crashed and came back
      ;; up or something
      ;; we want agent state synchronized with what we think it should be
      (invoke/retry-forever (lambda () (cancel-all-assignments)))

      (let loop ()
        ;; handle events
        (define time (current-seconds-monotonic))
        (define timeout-list
          (for/list ([(aid av) (in-hash assigned-tasks)])
            (define st (assignment-start-time-monotonic av))
            (max 0 (- *subtask-timeout* (- time st)))))
        (define nearest-timeout (if (empty? timeout-list) #f (apply min timeout-list)))
        (match (sync/timeout nearest-timeout msg-chan)
          [#f (handle-assignment-timeout)]
          [data (handle-command data)])
        (when run-agent-thd?
          (loop)))

      (handle-cleanup)
      (void)))

  ;; id to task-state
  (define current-tasks (make-hash))

  ;; hash of agents to handler channel
  (define agents (make-hash))

  ;; run handler
  (define run-handler? #t)

  ;; first two thread messages send the comms and tm
  (current-comms (thread-receive))
  (current-tm (thread-receive))

  (define (handle-stop-task task-id)
    ;; notify agents
    (for ([(id agent-data) (in-hash agents)])
      (async-channel-put (car agent-data) (cons 'cancel-task task-id)))
    (hash-remove! current-tasks task-id))

  (define (handle-thd-msg)
    (define (handle-delete-agent id)
      (match-define (cons chan thd) (hash-ref agents id))
      (async-channel-put chan 'shutdown)
      ;; give it some time, then break the thread
      (sync/timeout 5 thd)
      (break-thread thd)
      (hash-remove! agents id))

    (match (thread-receive)
      [(cons 'new-agent (cons id (cons arch resources)))
       (define cmd-channel (make-async-channel))
       (define thd
         (parameterize ([current-custodian cust])
           ;; TODO : add monitor thread that detects crashes and unassigns tasks
           (thread (lambda () (agent-thd id arch resources cmd-channel)))))
       ;; todo : notify agent of current tasks
       (hash-set! agents id (cons cmd-channel thd))]
      [(cons 'delete-agent id)
       (handle-delete-agent id)]
      [(cons 'new-task (cons id manifest))
       (define ts (initialize-task id manifest))
       (hash-set! current-tasks id ts)
       ;; notify agents
       (for ([(agent-id agent-data) (in-hash agents)])
         (log-server-info "registering task ~a with agent ~a" id agent-id)
         (async-channel-put (car agent-data) (cons 'new-task ts)))]
      [(cons 'agent-report (cons agent-id (cons assignment-id state)))
       (async-channel-put (car (hash-ref agents agent-id))
                          (cons 'agent-report (cons assignment-id state)))]
      [(cons 'cancel-task task-id)
       (handle-stop-task task-id)]
      ['shutdown
       (for ([(id _) (in-hash agents)])
         (handle-delete-agent id))
       (set! run-handler? #f)]
      [_ (error "unknown agent handler message")]))

  (let loop ()
    (define thd-evt (thread-receive-evt))
    (match (sync thd-evt)
      [(== thd-evt) (handle-thd-msg)])
    (when run-handler? (loop))))

(define (make-agent-handler)
  ;; TODO : monitor this thread because dying is probably fatal
  (thread agent-handler))

(define current-agent-handler (make-parameter #f))

(define (init-agent-handler [ah (current-agent-handler)])
  (thread-send ah (current-comms))
  (thread-send ah (current-tm)))

(define (agent-handler-new-agent id arch resources [ah (current-agent-handler)])
  (thread-send ah (cons 'new-agent (cons id (cons arch resources)))))

(define (agent-handler-delete-agent id [ah (current-agent-handler)])
  (thread-send ah (cons 'delete-agent id)))

(define (agent-handler-new-task id manifest [ah (current-agent-handler)])
  (thread-send ah (cons 'new-task (cons id manifest))))

(define (agent-handler-cancel-task task-id [ah (current-agent-handler)])
  (thread-send ah (cons 'cancel-task task-id)))

(define (agent-handler-shutdown [ah (current-agent-handler)])
  (thread-send ah 'shutdown)
  (thread-wait ah))

;; agent rpcs

;; report state 'incomplete 'complete 'error or a list of integer representing a success result
;; assignment-id is a numeric ID or #f if no currently assigned task
;; agents can call this as many times as they want -- it can serve as a sort of "ping"
(define (agent-report-state assignment-id state)
  (-> (or/c false? integer?) (or/c 'incomplete 'complete 'error (listof integer?)))
  (enforce-subject 'agent)
  (define agent-id (node-id (current-from-node)))
  ;; TODO : maybe wait for an actual completion here? idk
  (thread-send (current-agent-handler)
               (cons 'agent-report (cons agent-id (cons assignment-id state)))))


;; the server impl unit
(define-unit-from-context server-impl@ server^)

;; command line usage
(module+ main
  (require racket/cmdline)


  ;; basic server initialization
  (install-logging!)
  (log-server-info "starting crossfire-server v~a" (#%info-lookup 'version))

  (define server-config (call-with-input-file *server-config-path* read))
  (unless (list? server-config)
    (error "corrupted config file, expected a list of config entries"))
  (define (config-get key pred)
    (define (make-err-fmt actual)
      (format "invalid config!\n expected: (~a <value satisfying ~a>)\n got: ~a" key
              (if (contract? pred) (format "~v" pred) (object-name pred))
              actual))
    (match (assoc key server-config)
      [#f (error (make-err-fmt "<nothing>"))]
      [(list _ (? pred data)) data]
      [x (error (make-err-fmt x))]))

  (current-db (open-server-db 'create))
  (migrate-server-db)

  ;; load or create secret key
  (unless (file-exists? *server-seckey-path*)
    (log-server-info "generating new secret key")
    (call-with-output-file *server-seckey-path*
      (lambda (out)
        (write-bytes (crypto-sign-make-key) out)
        (port-fsync out))))

  ;; create client if it doesn't exist yet
  (when (empty? (get-nodes 'client))
    (log-server-info "creating new client")
    (make-node "client0" "x-client" 'client '())
    (void))

  (define seckey (file->bytes *server-seckey-path*))
  (define pubkey (crypto-sign-public-key seckey))
  (define listen-addr
    (match (config-get 'listen-addr (or/c 'auto string?))
      ['auto "0.0.0.0"]
      [addr addr]))
  (define server (node *server-node-id* (config-get 'name string?) 'server pubkey seckey
                       listen-addr (config-get 'listen-port integer?)))
  (define public-addr
    (match (config-get 'public-addr (or/c 'auto string?))
      ['auto (with-handlers ([exn:fail? (lambda (_) (error "failed to autodetect hostname"))])
               (get-hostname))]
      [addr addr]))
  (define public-port
    (match (config-get 'public-port (or/c 'auto integer?))
      ['auto (node-port server)]
      [port port]))
  (current-server-public-node
    (struct-copy node server [seckey #f] [host public-addr] [port public-port]))


  ;; read command line
  ;; TODO : read cmdline for admin commands
  ;; ideally allow the admin commands to coexist with an actual current running server

  (match (current-command-line-arguments)
    [(vector "dev-new-agent" name arch)
     (define-values [id _] (make-node name arch 'agent '("cpu")))
     (define agent-node (load-comms-node id #t))
     (call-with-output-file (build-path *state-root* (string-append name ".rktd"))
       (lambda (out) (write (list agent-node (current-server-public-node)) out)))
     (log-server-info "created dev agent ~a" name)
     (exit)]
    [(vector "dev-export-client")
     (define client-id (node-info-id (first (get-nodes 'client))))
     (define client-node (load-comms-node client-id #t))
     (define data (list client-node (current-server-public-node)))
     (with-output-to-file (build-path *state-root* "client0.rktd")
       (lambda () (write data)))
     (exit)]
    [(vector "dev-new-project" tgz)
     (define id (make-task '((name "test project") (arch "any")
                             (command "./test.sh")
                             (resources "cpu") (pattern "meow?d?d"))
                           (file->bytes tgz)))
     (log-server-info "created project")
     (exit)]
    [(vector subcmd _ ...)
     (error "invalid subcommand" subcmd)]
    [argv (void)])


  ;; start server and start doing stuff

  (current-agent-handler (make-agent-handler))
  (current-comms (make-comms server))
  (current-tm (make-transaction-manager server (current-comms)))
  (rpc-register-all server^ server-impl@)
  (init-agent-handler)

  ;; restore agents
  (for ([agent (in-list (get-nodes 'agent))])
    (log-server-info "restoring agent ~a" (node-info-id agent))
    (restore-comms-node (node-info-id agent))
    (agent-handler-new-agent (node-info-id agent) (node-info-arch agent)
                             (node-info-resources agent)))

  ;; restore clients
  (for ([client (in-list (get-nodes 'client))])
    (log-server-info "restoring client ~a" (node-info-id client))
    (restore-comms-node (node-info-id client)))

  ;; restore active tasks
  (for ([(id name manifest-in complete?) (in-query (current-db) q-get-tasks)]
        #:when (zero? complete?))
    (log-server-info "restoring project ~a" id)
    (define manifest (parse-manifest (fasl->s-exp manifest-in)))
    (agent-handler-new-task id manifest))

  ;; now server is ready to start listening
  (comms-listen (current-comms) (node-port server))

  (log-server-info "server running")

  ;; wait for break
  (with-handlers ([exn:break? void])
    (sync never-evt))

  ;; shutdown
  ;; a second break aborts clean shutdown. ideally, don't break again unless necessary
  (log-server-info "stopping server")
  (agent-handler-shutdown)
  (tm-shutdown (current-tm))
  (comms-shutdown (current-comms))

  (void))