fix agent bugs with task execution
This commit is contained in:
parent
8ec0861907
commit
b97bfcbc6f
4
Makefile
4
Makefile
|
@ -33,7 +33,9 @@ dev-rollback:
|
|||
raco north rollback -p crossfire/migrations -f
|
||||
|
||||
dev-make-agent:
|
||||
$(RM) agent-deployment/app.zo
|
||||
cd agent-deployment && $(MAKE)
|
||||
[ -d lib ] || mkdir lib
|
||||
cd agent-deployment && for dir in arch_*; do \
|
||||
mkdir ../lib/$$dir; cp $$dir/crossfire-agent ../lib/$$dir/; done
|
||||
[ -d ../lib/$$dir ] || mkdir ../lib/$$dir; \
|
||||
cp $$dir/crossfire-agent ../lib/$$dir/; done
|
||||
|
|
|
@ -44,23 +44,32 @@
|
|||
(parameterize ([current-custodian cust])
|
||||
(thread (lambda ()
|
||||
;; kinda pointless, other than helping keep the connection alive
|
||||
(let loop () (agent-report-state #f #f) (sleep *ping-secs*) (loop)))))
|
||||
(let loop ()
|
||||
(with-handlers ([exn:fail? (lambda (ex) ((error-display-handler) (exn-message ex) ex))])
|
||||
(agent-report-state #f #f))
|
||||
(sleep *ping-secs*) (loop)))))
|
||||
|
||||
(define last-cache-update (current-seconds-monotonic))
|
||||
(define run-agent? #t)
|
||||
(define assignments (make-hash))
|
||||
|
||||
(struct download [thd file-hash/hex extract-dir [waiters #:mutable]] #:transparent)
|
||||
(struct download [thd file-hash/hex extract-dir [waiters #:mutable] success] #:transparent)
|
||||
(define downloads (make-hash))
|
||||
|
||||
(define (download/extract tid tgz-file extract-dir)
|
||||
(define (download/extract tid tgz-file extract-dir success-box)
|
||||
(define (cleanup)
|
||||
(with-handlers ([exn:fail:filesystem? void]) (delete-directory/files tgz-file))
|
||||
(with-handlers ([exn:fail:filesystem? void]) (delete-directory/files extract-dir))
|
||||
(with-handlers ([exn:fail:filesystem? void]) (delete-directory/files extract-dir)))
|
||||
(log-agent-info "downloading task data for ~a" tid)
|
||||
(with-handlers ([exn:fail? (lambda (ex)
|
||||
((error-display-handler) (exn-message ex) ex)
|
||||
(cleanup))])
|
||||
;; TODO this should be updated with the streaming interface
|
||||
(call-with-output-file tgz-file (lambda (out) (write-bytes (get-project-file tid) out)))
|
||||
(call-with-output-file tgz-file (lambda (out) (write-bytes (get-project-file tid) out))
|
||||
#:exists 'truncate)
|
||||
(log-agent-info "extracting task data for ~a" tid)
|
||||
(untgz tgz-file #:dest extract-dir))
|
||||
(untgz tgz-file #:dest extract-dir)
|
||||
(set-box! success-box #t)))
|
||||
|
||||
(let loop ()
|
||||
(define cache-update-delta (max 0 (- (+ last-cache-update *max-cache-age*)
|
||||
|
@ -90,11 +99,21 @@
|
|||
;; download completed
|
||||
[(? thread? dl-thd)
|
||||
;; argh
|
||||
(match-define (download thd file-hash/hex extract-dir waiters)
|
||||
(define the-dl
|
||||
(for/first ([(tid dl) (in-hash downloads)] #:when (eq? dl-thd (download-thd dl)))
|
||||
(hash-remove! downloads tid)
|
||||
(log-agent-info "completed download for ~a" tid)
|
||||
dl))
|
||||
(cond
|
||||
[(false? the-dl) (log-agent-error "download completed, but missing record of it")]
|
||||
[(false? (unbox (download-success the-dl)))
|
||||
;; download failed, report error
|
||||
(for ([assignment (in-list (download-waiters the-dl))])
|
||||
(log-agent-error "download failed for ~a" (assignment-id assignment))
|
||||
(agent-report-state (assignment-id assignment) 'error))]
|
||||
[else
|
||||
;; start it
|
||||
(match-define (download thd file-hash/hex extract-dir waiters success-box) the-dl)
|
||||
(hash-set! cache-info file-hash/hex (current-seconds))
|
||||
(update-workdir-cache! workdir cache-info)
|
||||
(set! last-cache-update (current-seconds-monotonic))
|
||||
|
@ -102,7 +121,7 @@
|
|||
(for ([assignment (in-list waiters)])
|
||||
(parameterize ([current-custodian cust])
|
||||
(hash-set! assignments (assignment-id assignment)
|
||||
(thread (lambda () (execute-assignment assignment extract-dir))))))]
|
||||
(thread (lambda () (execute-assignment assignment extract-dir))))))])]
|
||||
[(cons 'new assignment)
|
||||
(define aid (assignment-id assignment))
|
||||
;; cancel old assignment with the same id, if exists
|
||||
|
@ -129,16 +148,18 @@
|
|||
(set-download-waiters! dl (cons assignment (download-waiters dl)))]
|
||||
[else
|
||||
(log-agent-info "starting download for ~a" tid)
|
||||
(define dl (download (thread (thunk (download/extract tid tgz-file extract-dir)))
|
||||
file-hash/hex extract-dir (list assignment)))
|
||||
(hash-set! downloads tid dl)])])
|
||||
(define success-box (box #f))
|
||||
(define dl
|
||||
(download (thread (thunk (download/extract tid tgz-file extract-dir success-box)))
|
||||
file-hash/hex extract-dir (list assignment) success-box))
|
||||
(hash-set! downloads tid dl)])]
|
||||
[x (log-agent-error "unexpected message" x)])
|
||||
|
||||
(when run-agent? (loop)))
|
||||
;; TODO : report errors for all in-progress assignments or something
|
||||
(custodian-shutdown-all cust))
|
||||
|
||||
(define (execute-assignment assignment extract-dir)
|
||||
;; TODO : on cancel-assignment, actually kill the process and stuff
|
||||
;; TODO : do local verification of resource usage. if the server starts an assignment that uses
|
||||
;; resource A and we're already using resource A, kill the old assignment
|
||||
|
||||
|
@ -187,7 +208,10 @@
|
|||
(parameterize ([current-custodian cust] [current-directory extract-dir])
|
||||
(apply subprocess #f #f (current-error-port) 'new (append cmd args))))
|
||||
|
||||
(with-handlers ([exn:break? (lambda (_) (subprocess-kill proc #f)
|
||||
(with-handlers ([exn:break? (lambda (_)
|
||||
(log-agent-info "killing process for ~a" aid)
|
||||
;; nicely ask the process to stop
|
||||
(subprocess-kill proc #f)
|
||||
(sync/timeout *subproc-kill-delay* proc)
|
||||
;; will handle killing for us :P
|
||||
(custodian-shutdown-all cust)
|
||||
|
@ -196,7 +220,7 @@
|
|||
(define line-match (regexp-match-evt #px"^[^\n]*\n" out))
|
||||
(define eof-e (eof-evt out))
|
||||
(let loop ([reached-eof #f] [proc-done #f])
|
||||
(match (sync proc line-match eof-e)
|
||||
(match (sync/enable-break proc line-match eof-e)
|
||||
[(== proc)
|
||||
(unless reached-eof
|
||||
(loop reached-eof #t))]
|
||||
|
|
Loading…
Reference in New Issue