if too many errors on a task, stop trying
This commit is contained in:
parent
594d785641
commit
1a66ff29f3
10
README.md
10
README.md
|
@ -148,11 +148,11 @@ contributions welcome,,,,
|
||||||
## status
|
## status
|
||||||
|
|
||||||
### base
|
### base
|
||||||
- 🚧 input space manipulation functions
|
- ✅ input space manipulation functions
|
||||||
- ✅ data types: using data/integer-set, pattern (vector of integer-set)
|
- ✅ data types: using data/integer-set, pattern (vector of integer-set)
|
||||||
- ✅ basic manipulation functions
|
- ✅ basic manipulation functions
|
||||||
- ✅ representation of input space as a flat integer
|
- ✅ representation of input space as a flat integer
|
||||||
- 🚧 #lang for configuration/definitions
|
- ✅ #lang for configuration/definitions
|
||||||
- (input) mode
|
- (input) mode
|
||||||
- stdio: user program gets input by stdio, integers separated by space, one per line
|
- stdio: user program gets input by stdio, integers separated by space, one per line
|
||||||
- callback: input generator compiled into user program, user main calls `crossfire_main`
|
- callback: input generator compiled into user program, user main calls `crossfire_main`
|
||||||
|
@ -161,8 +161,8 @@ contributions welcome,,,,
|
||||||
- SMP: performed by crossfire or performed by the user code
|
- SMP: performed by crossfire or performed by the user code
|
||||||
- "performed by user code" can also mean GPU, for example
|
- "performed by user code" can also mean GPU, for example
|
||||||
- 🚧 codegen for input generator (in C)
|
- 🚧 codegen for input generator (in C)
|
||||||
- ✅ stdio mode
|
- 🚧 stdio mode
|
||||||
- 🚧 callback mode
|
- ✅ callback mode
|
||||||
- ✅ success reporting mechanism
|
- ✅ success reporting mechanism
|
||||||
- low priority: configurable "character" type -- currently a "character" is a uint64\_t
|
- low priority: configurable "character" type -- currently a "character" is a uint64\_t
|
||||||
|
|
||||||
|
@ -172,7 +172,7 @@ contributions welcome,,,,
|
||||||
- low priority: randomized input space distribution
|
- low priority: randomized input space distribution
|
||||||
- low priority: store common configuration templates for clients
|
- low priority: store common configuration templates for clients
|
||||||
- low priority: track upload/download progress
|
- low priority: track upload/download progress
|
||||||
- streaming interface for file transfers
|
- ✅ streaming interface for file transfers
|
||||||
- ✅ accept submitted projects (with client-compiled input generator) and distribute to agents
|
- ✅ accept submitted projects (with client-compiled input generator) and distribute to agents
|
||||||
- ✅ low priority: support for multiple architectures
|
- ✅ low priority: support for multiple architectures
|
||||||
- ✅ agent authentication
|
- ✅ agent authentication
|
||||||
|
|
|
@ -407,6 +407,11 @@
|
||||||
(define *max-retry-delay* 120)
|
(define *max-retry-delay* 120)
|
||||||
(define *retry-delay-ratio* 2)
|
(define *retry-delay-ratio* 2)
|
||||||
|
|
||||||
|
;; every time an agent reports an error on a task, we increment that agent's error count for the task
|
||||||
|
;; every time an agent completes work successfully, we decrement its error count by one (never below 0)
|
||||||
|
;; once an agent's error count for a task reaches this threshold, we stop assigning that task to the agent
|
||||||
|
(define *max-task-errors* 10)
|
||||||
|
|
||||||
(define (agent-handler)
|
(define (agent-handler)
|
||||||
;; unlike comms, messages to agent-handler have no responses. just thread-send, it's one-way
|
;; unlike comms, messages to agent-handler have no responses. just thread-send, it's one-way
|
||||||
(define cust (make-custodian))
|
(define cust (make-custodian))
|
||||||
|
@ -427,7 +432,7 @@
|
||||||
;; precisely than just the taskid, which allows them to cache the file locally
|
;; precisely than just the taskid, which allows them to cache the file locally
|
||||||
;; completed-work: an integer set of completed work
|
;; completed-work: an integer set of completed work
|
||||||
(struct task-state [id sema manifest [work-pattern #:mutable] agent-todo file-hash
|
(struct task-state [id sema manifest [work-pattern #:mutable] agent-todo file-hash
|
||||||
[completed-work #:mutable]] #:transparent)
|
[completed-work #:mutable] error-log] #:transparent)
|
||||||
|
|
||||||
(define (initialize-task id mf)
|
(define (initialize-task id mf)
|
||||||
(define file-hash (server-hash-file id))
|
(define file-hash (server-hash-file id))
|
||||||
|
@ -441,7 +446,8 @@
|
||||||
([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)])
|
([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)])
|
||||||
(define sub (make-integer-set (fasl->s-exp pat-fasl)))
|
(define sub (make-integer-set (fasl->s-exp pat-fasl)))
|
||||||
(values (integer-set-subtract pattern-range sub) (integer-set-union completed-work sub))))
|
(values (integer-set-subtract pattern-range sub) (integer-set-union completed-work sub))))
|
||||||
(task-state id sema mf pattern-range agent-todo file-hash completed-work))
|
(define error-log (make-hash))
|
||||||
|
(task-state id sema mf pattern-range agent-todo file-hash completed-work error-log))
|
||||||
|
|
||||||
(define (task-has-work? ts)
|
(define (task-has-work? ts)
|
||||||
(not (and (zero? (integer-set-count (task-state-work-pattern ts)))
|
(not (and (zero? (integer-set-count (task-state-work-pattern ts)))
|
||||||
|
@ -498,8 +504,10 @@
|
||||||
assignment]))))
|
assignment]))))
|
||||||
|
|
||||||
;; returns work from agent back to the regular work pool
|
;; returns work from agent back to the regular work pool
|
||||||
(define (task-unassign! ts agent-id)
|
(define (task-unassign! ts agent-id [error? #f])
|
||||||
(call-with-semaphore (task-state-sema ts) (lambda ()
|
(call-with-semaphore (task-state-sema ts) (lambda ()
|
||||||
|
(when error?
|
||||||
|
(hash-update! (task-state-error-log ts) agent-id add1 0))
|
||||||
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
|
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
|
||||||
[#f (void)]
|
[#f (void)]
|
||||||
[assignment
|
[assignment
|
||||||
|
@ -511,6 +519,7 @@
|
||||||
;; adds to task log, then updates work pool with task completion
|
;; adds to task log, then updates work pool with task completion
|
||||||
(define (task-complete! ts agent-id time-wall-start duration)
|
(define (task-complete! ts agent-id time-wall-start duration)
|
||||||
(call-with-semaphore (task-state-sema ts) (lambda ()
|
(call-with-semaphore (task-state-sema ts) (lambda ()
|
||||||
|
(hash-update! (task-state-error-log ts) agent-id (lambda (x) (max 0 (sub1 x))) 0)
|
||||||
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
|
(match (hash-ref (task-state-agent-todo ts) agent-id #f)
|
||||||
[#f (void)]
|
[#f (void)]
|
||||||
[assignment
|
[assignment
|
||||||
|
@ -619,12 +628,12 @@
|
||||||
(integer-set-contents assign-data)))
|
(integer-set-contents assign-data)))
|
||||||
#t]))
|
#t]))
|
||||||
|
|
||||||
(define (cancel-assignment! assignment)
|
(define (cancel-assignment! assignment [error? #f])
|
||||||
;; tell the agent to cancel work, unassign the assignment
|
;; tell the agent to cancel work, unassign the assignment
|
||||||
(define ts (hash-ref current-tasks (assignment-taskid assignment) #f))
|
(define ts (hash-ref current-tasks (assignment-taskid assignment) #f))
|
||||||
;; update manager tracking
|
;; update manager tracking
|
||||||
(unless (false? ts)
|
(unless (false? ts)
|
||||||
(task-unassign! ts id))
|
(task-unassign! ts id error?))
|
||||||
(hash-remove! assigned-tasks (assignment-id assignment))
|
(hash-remove! assigned-tasks (assignment-id assignment))
|
||||||
;; send agent rpc
|
;; send agent rpc
|
||||||
(with-handlers ([exn:fail?
|
(with-handlers ([exn:fail?
|
||||||
|
@ -705,10 +714,12 @@
|
||||||
['() #f]
|
['() #f]
|
||||||
[(cons head tail)
|
[(cons head tail)
|
||||||
(define manifest (task-state-manifest head))
|
(define manifest (task-state-manifest head))
|
||||||
|
(define error-count (hash-ref (task-state-error-log head) id 0))
|
||||||
(define needed-resources (list->set (manifest-data-ref manifest 'resources '())))
|
(define needed-resources (list->set (manifest-data-ref manifest 'resources '())))
|
||||||
(define needed-arch (manifest-data-ref manifest 'arch '("any")))
|
(define needed-arch (manifest-data-ref manifest 'arch '("any")))
|
||||||
(define right-arch? (or (member "any" needed-arch) (member arch needed-arch)))
|
(define right-arch? (or (member "any" needed-arch) (member arch needed-arch)))
|
||||||
(if (and right-arch?
|
(if (and right-arch?
|
||||||
|
(< error-count *max-task-errors*)
|
||||||
(task-has-work? head)
|
(task-has-work? head)
|
||||||
(subset? needed-resources available-resources))
|
(subset? needed-resources available-resources))
|
||||||
(create-assignment! head)
|
(create-assignment! head)
|
||||||
|
@ -747,11 +758,12 @@
|
||||||
(unless (false? av) (complete-assignment! av))
|
(unless (false? av) (complete-assignment! av))
|
||||||
(update-assignments!)]
|
(update-assignments!)]
|
||||||
;; execution failed, unassign task
|
;; execution failed, unassign task
|
||||||
;; TODO : maybe blacklist this task from this agent if there are too many errors
|
|
||||||
;; TODO : notify connected clients that an error occurred
|
;; TODO : notify connected clients that an error occurred
|
||||||
[(cons assignment-id 'error)
|
[(cons assignment-id 'error)
|
||||||
(define av (hash-ref assigned-tasks assignment-id #f))
|
(define av (hash-ref assigned-tasks assignment-id #f))
|
||||||
(unless (false? av) (cancel-assignment! av))
|
(unless (false? av)
|
||||||
|
;; #t for yes, this is an error
|
||||||
|
(cancel-assignment! av #t))
|
||||||
(update-assignments!)]
|
(update-assignments!)]
|
||||||
;; got succeeding input
|
;; got succeeding input
|
||||||
[(cons assignment-id success-input)
|
[(cons assignment-id success-input)
|
||||||
|
|
Loading…
Reference in New Issue