if too many errors on a task, stop trying

This commit is contained in:
xenia 2021-01-10 01:05:23 -05:00
parent 594d785641
commit 1a66ff29f3
2 changed files with 24 additions and 12 deletions

View File

@ -148,11 +148,11 @@ contributions welcome,,,,
## status
### base
- 🚧 input space manipulation functions
- input space manipulation functions
- ✅ data types: using data/integer-set, pattern (vector of integer-set)
- ✅ basic manipulation functions
- ✅ representation of input space as a flat integer
- 🚧 #lang for configuration/definitions
- #lang for configuration/definitions
- (input) mode
- stdio: user program gets input by stdio, integers separated by space, one per line
- callback: input generator compiled into user program, user main calls `crossfire_main`
@ -161,8 +161,8 @@ contributions welcome,,,,
- SMP: performed by crossfire or performed by the user code
- "performed by user code" can also mean GPU, for example
- 🚧 codegen for input generator (in C)
- stdio mode
- 🚧 callback mode
- 🚧 stdio mode
- callback mode
- ✅ success reporting mechanism
- low priority: configurable "character" type -- currently a "character" is a uint64\_t
@ -172,7 +172,7 @@ contributions welcome,,,,
- low priority: randomized input space distribution
- low priority: store common configuration templates for clients
- low priority: track upload/download progress
- streaming interface for file transfers
- streaming interface for file transfers
- ✅ accept submitted projects (with client-compiled input generator) and distribute to agents
- ✅ low priority: support for multiple architectures
- ✅ agent authentication

View File

@ -407,6 +407,11 @@
(define *max-retry-delay* 120)
(define *retry-delay-ratio* 2)
;; every time an agent reports an error on a task, we increment that agent's error count for the task
;; every time that agent completes work on the task, we decrement the count by one (never below 0)
;; once an agent's error count reaches this threshold, we stop assigning the task to that agent
(define *max-task-errors* 10)
(define (agent-handler)
;; unlike comms, messages to agent-handler have no responses. just thread-send, it's one-way
(define cust (make-custodian))
@ -427,7 +432,7 @@
;; precisely than just the taskid, which allows them to cache the file locally
;; completed-work: an integer set of completed work
(struct task-state [id sema manifest [work-pattern #:mutable] agent-todo file-hash
[completed-work #:mutable]] #:transparent)
[completed-work #:mutable] error-log] #:transparent)
(define (initialize-task id mf)
(define file-hash (server-hash-file id))
@ -441,7 +446,8 @@
([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)])
(define sub (make-integer-set (fasl->s-exp pat-fasl)))
(values (integer-set-subtract pattern-range sub) (integer-set-union completed-work sub))))
(task-state id sema mf pattern-range agent-todo file-hash completed-work))
(define error-log (make-hash))
(task-state id sema mf pattern-range agent-todo file-hash completed-work error-log))
(define (task-has-work? ts)
(not (and (zero? (integer-set-count (task-state-work-pattern ts)))
@ -498,8 +504,10 @@
assignment]))))
;; returns work from agent back to the regular work pool; when error? is true, also increments the agent's error count
(define (task-unassign! ts agent-id)
(define (task-unassign! ts agent-id [error? #f])
(call-with-semaphore (task-state-sema ts) (lambda ()
(when error?
(hash-update! (task-state-error-log ts) agent-id add1 0))
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
[#f (void)]
[assignment
@ -511,6 +519,7 @@
;; adds to task log, then updates work pool with task completion
(define (task-complete! ts agent-id time-wall-start duration)
(call-with-semaphore (task-state-sema ts) (lambda ()
(hash-update! (task-state-error-log ts) agent-id (lambda (x) (max 0 (sub1 x))) 0)
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
[#f (void)]
[assignment
@ -619,12 +628,12 @@
(integer-set-contents assign-data)))
#t]))
(define (cancel-assignment! assignment)
(define (cancel-assignment! assignment [error? #f])
;; tell the agent to cancel work, unassign the assignment
(define ts (hash-ref current-tasks (assignment-taskid assignment) #f))
;; update manager tracking
(unless (false? ts)
(task-unassign! ts id))
(task-unassign! ts id error?))
(hash-remove! assigned-tasks (assignment-id assignment))
;; send agent rpc
(with-handlers ([exn:fail?
@ -705,10 +714,12 @@
['() #f]
[(cons head tail)
(define manifest (task-state-manifest head))
(define error-count (hash-ref (task-state-error-log head) id 0))
(define needed-resources (list->set (manifest-data-ref manifest 'resources '())))
(define needed-arch (manifest-data-ref manifest 'arch '("any")))
(define right-arch? (or (member "any" needed-arch) (member arch needed-arch)))
(if (and right-arch?
(< error-count *max-task-errors*)
(task-has-work? head)
(subset? needed-resources available-resources))
(create-assignment! head)
@ -747,11 +758,12 @@
(unless (false? av) (complete-assignment! av))
(update-assignments!)]
;; execution failed, unassign task
;; TODO : maybe blacklist this task from this agent if there are too many errors
;; TODO : notify connected clients that an error occurred
[(cons assignment-id 'error)
(define av (hash-ref assigned-tasks assignment-id #f))
(unless (false? av) (cancel-assignment! av))
(unless (false? av)
;; #t marks this cancellation as an error, so it is counted in the task's error log for this agent
(cancel-assignment! av #t))
(update-assignments!)]
;; got succeeding input
[(cons assignment-id success-input)