if too many errors on a task, stop trying
This commit is contained in:
parent
594d785641
commit
1a66ff29f3
10
README.md
10
README.md
|
@ -148,11 +148,11 @@ contributions welcome,,,,
|
|||
## status
|
||||
|
||||
### base
|
||||
- 🚧 input space manipulation functions
|
||||
- ✅ input space manipulation functions
|
||||
- ✅ data types: using data/integer-set, pattern (vector of integer-set)
|
||||
- ✅ basic manipulation functions
|
||||
- ✅ representation of input space as a flat integer
|
||||
- 🚧 #lang for configuration/definitions
|
||||
- ✅ #lang for configuration/definitions
|
||||
- (input) mode
|
||||
- stdio: user program gets input by stdio, integers separated by space, one per line
|
||||
- callback: input generator compiled into user program, user main calls `crossfire_main`
|
||||
|
@ -161,8 +161,8 @@ contributions welcome,,,,
|
|||
- SMP: performed by crossfire or performed by the user code
|
||||
- "performed by user code" can also mean GPU, for example
|
||||
- 🚧 codegen for input generator (in C)
|
||||
- ✅ stdio mode
|
||||
- 🚧 callback mode
|
||||
- 🚧 stdio mode
|
||||
- ✅ callback mode
|
||||
- ✅ success reporting mechanism
|
||||
- low priority: configurable "character" type -- currently a "character" is a uint64\_t
|
||||
|
||||
|
@ -172,7 +172,7 @@ contributions welcome,,,,
|
|||
- low priority: randomized input space distribution
|
||||
- low priority: store common configuration templates for clients
|
||||
- low priority: track upload/download progress
|
||||
- streaming interface for file transfers
|
||||
- ✅ streaming interface for file transfers
|
||||
- ✅ accept submitted projects (with client-compiled input generator) and distribute to agents
|
||||
- ✅ low priority: support for multiple architectures
|
||||
- ✅ agent authentication
|
||||
|
|
|
@ -407,6 +407,11 @@
|
|||
(define *max-retry-delay* 120)
|
||||
(define *retry-delay-ratio* 2)
|
||||
|
||||
;; every time an agent reports an error on a task, we increment that agent's error count for the task
|
||||
;; every time there is a success, we decrement the error count (it never drops below 0)
|
||||
;; once the error count reaches this threshold, we stop trying to run this task
|
||||
(define *max-task-errors* 10)
|
||||
|
||||
(define (agent-handler)
|
||||
;; unlike comms, messages to agent-handler have no responses. just thread-send, it's one-way
|
||||
(define cust (make-custodian))
|
||||
|
@ -427,7 +432,7 @@
|
|||
;; precisely than just the taskid, which allows them to cache the file locally
|
||||
;; completed-work: an integer set of completed work
|
||||
(struct task-state [id sema manifest [work-pattern #:mutable] agent-todo file-hash
|
||||
[completed-work #:mutable]] #:transparent)
|
||||
[completed-work #:mutable] error-log] #:transparent)
|
||||
|
||||
(define (initialize-task id mf)
|
||||
(define file-hash (server-hash-file id))
|
||||
|
@ -441,7 +446,8 @@
|
|||
([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)])
|
||||
(define sub (make-integer-set (fasl->s-exp pat-fasl)))
|
||||
(values (integer-set-subtract pattern-range sub) (integer-set-union completed-work sub))))
|
||||
(task-state id sema mf pattern-range agent-todo file-hash completed-work))
|
||||
(define error-log (make-hash))
|
||||
(task-state id sema mf pattern-range agent-todo file-hash completed-work error-log))
|
||||
|
||||
(define (task-has-work? ts)
|
||||
(not (and (zero? (integer-set-count (task-state-work-pattern ts)))
|
||||
|
@ -498,8 +504,10 @@
|
|||
assignment]))))
|
||||
|
||||
;; returns work from agent back to the regular work pool
|
||||
(define (task-unassign! ts agent-id)
|
||||
(define (task-unassign! ts agent-id [error? #f])
|
||||
(call-with-semaphore (task-state-sema ts) (lambda ()
|
||||
(when error?
|
||||
(hash-update! (task-state-error-log ts) agent-id add1 0))
|
||||
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
|
||||
[#f (void)]
|
||||
[assignment
|
||||
|
@ -511,6 +519,7 @@
|
|||
;; adds to task log, then updates work pool with task completion
|
||||
(define (task-complete! ts agent-id time-wall-start duration)
|
||||
(call-with-semaphore (task-state-sema ts) (lambda ()
|
||||
(hash-update! (task-state-error-log ts) agent-id (lambda (x) (max 0 (sub1 x))) 0)
|
||||
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
|
||||
[#f (void)]
|
||||
[assignment
|
||||
|
@ -619,12 +628,12 @@
|
|||
(integer-set-contents assign-data)))
|
||||
#t]))
|
||||
|
||||
(define (cancel-assignment! assignment)
|
||||
(define (cancel-assignment! assignment [error? #f])
|
||||
;; tell the agent to cancel work, unassign the assignment
|
||||
(define ts (hash-ref current-tasks (assignment-taskid assignment) #f))
|
||||
;; update manager tracking
|
||||
(unless (false? ts)
|
||||
(task-unassign! ts id))
|
||||
(task-unassign! ts id error?))
|
||||
(hash-remove! assigned-tasks (assignment-id assignment))
|
||||
;; send agent rpc
|
||||
(with-handlers ([exn:fail?
|
||||
|
@ -705,10 +714,12 @@
|
|||
['() #f]
|
||||
[(cons head tail)
|
||||
(define manifest (task-state-manifest head))
|
||||
(define error-count (hash-ref (task-state-error-log head) id 0))
|
||||
(define needed-resources (list->set (manifest-data-ref manifest 'resources '())))
|
||||
(define needed-arch (manifest-data-ref manifest 'arch '("any")))
|
||||
(define right-arch? (or (member "any" needed-arch) (member arch needed-arch)))
|
||||
(if (and right-arch?
|
||||
(< error-count *max-task-errors*)
|
||||
(task-has-work? head)
|
||||
(subset? needed-resources available-resources))
|
||||
(create-assignment! head)
|
||||
|
@ -747,11 +758,12 @@
|
|||
(unless (false? av) (complete-assignment! av))
|
||||
(update-assignments!)]
|
||||
;; execution failed, unassign task
|
||||
;; TODO : maybe blacklist this task from this agent if there are too many errors
|
||||
;; TODO : notify connected clients that an error occurred
|
||||
[(cons assignment-id 'error)
|
||||
(define av (hash-ref assigned-tasks assignment-id #f))
|
||||
(unless (false? av) (cancel-assignment! av))
|
||||
(unless (false? av)
|
||||
;; #t for yes, this is an error
|
||||
(cancel-assignment! av #t))
|
||||
(update-assignments!)]
|
||||
;; got succeeding input
|
||||
[(cons assignment-id success-input)
|
||||
|
|
Loading…
Reference in New Issue