ApiServer: c.APIHost,
ApiToken: c.AuthToken,
ApiInsecure: c.Insecure,
- Client: &http.Client{Transport: &http.Transport{
- TLSClientConfig: MakeTLSConfig(c.Insecure)}},
+ Client: &http.Client{
+ Timeout: 5 * time.Minute,
+ Transport: &http.Transport{
+ TLSClientConfig: MakeTLSConfig(c.Insecure)},
+ },
External: false,
Retries: 2,
KeepServiceURIs: c.KeepServiceURIs,
import (
"context"
"fmt"
- "log"
"sync"
"time"
"git.curoverse.com/arvados.git/sdk/go/arvados"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+ "github.com/Sirupsen/logrus"
)
const (
Cancelled = arvados.ContainerStateCancelled
)
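+// A Logger accepts log messages from the dispatcher. It is satisfied
+// by *logrus.Logger and logrus.FieldLogger, so (for example) a caller
+// can pass in the process-wide logrus logger:
+//
+//	d := dispatch.Dispatcher{Arv: arv, Logger: logrus.StandardLogger()}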
+type Logger interface {
+ Printf(string, ...interface{})
+ Warnf(string, ...interface{})
+ Debugf(string, ...interface{})
+}
+
-// Dispatcher struct
+// A Dispatcher fetches container records from the API server and
+// dispatches them to RunContainer goroutines.
type Dispatcher struct {
Arv *arvadosclient.ArvadosClient
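+ // Logger used to report dispatcher activity. If nil, Run
+ // substitutes logrus.StandardLogger().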
+ Logger Logger
+
// Batch size for container queries
BatchSize int64
// dispatcher's token. When a new one appears, Run calls RunContainer
// in a new goroutine.
func (d *Dispatcher) Run(ctx context.Context) error {
+ if d.Logger == nil {
+ d.Logger = logrus.StandardLogger()
+ }
+
err := d.Arv.Call("GET", "api_client_authorizations", "", "current", nil, &d.auth)
if err != nil {
return fmt.Errorf("error getting my token UUID: %v", err)
// Containers that I know about that didn't show up in any
// query should be let go.
for uuid, tracker := range todo {
- log.Printf("Container %q not returned by any query, stopping tracking.", uuid)
+ d.Logger.Printf("Container %q not returned by any query, stopping tracking.", uuid)
tracker.close()
}
// Start a runner in a new goroutine, and send the initial container
// record to its updates channel.
func (d *Dispatcher) start(c arvados.Container) *runTracker {
- tracker := &runTracker{updates: make(chan arvados.Container, 1)}
+ tracker := &runTracker{
+ updates: make(chan arvados.Container, 1),
+ logger: d.Logger,
+ }
tracker.updates <- c
go func() {
d.RunContainer(d, c, tracker.updates)
"order": []string{"priority desc"}}
err := d.Arv.List("containers", params, &countList)
if err != nil {
- log.Printf("error getting count of containers: %q", err)
+ d.Logger.Warnf("error getting count of containers: %q", err)
return false
}
itemsAvailable := countList.ItemsAvailable
err := d.Arv.List("containers", params, &list)
if err != nil {
- log.Printf("Error getting list of containers: %q", err)
+ d.Logger.Warnf("error getting list of containers: %q", err)
return false
}
d.checkListForUpdates(list.Items, todo)
delete(todo, c.UUID)
if c.LockedByUUID != "" && c.LockedByUUID != d.auth.UUID {
- log.Printf("debug: ignoring %s locked by %s", c.UUID, c.LockedByUUID)
+ d.Logger.Debugf("ignoring %s locked by %s", c.UUID, c.LockedByUUID)
} else if alreadyTracking {
switch c.State {
case Queued:
}
err := d.lock(c.UUID)
if err != nil {
- log.Printf("debug: error locking container %s: %s", c.UUID, err)
+ d.Logger.Warnf("error locking container %s: %s", c.UUID, err)
break
}
c.State = Locked
"container": arvadosclient.Dict{"state": state},
}, nil)
if err != nil {
- log.Printf("Error updating container %s to state %q: %s", uuid, state, err)
+ d.Logger.Warnf("error updating container %s to state %q: %s", uuid, state, err)
}
return err
}
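+// A runTracker relays updated container records to a RunContainer
+// goroutine via its updates channel.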
type runTracker struct {
closing bool
updates chan arvados.Container
+ logger Logger
}
func (tracker *runTracker) close() {
}
select {
case <-tracker.updates:
- log.Printf("debug: runner is handling updates slowly, discarded previous update for %s", c.UUID)
+ tracker.logger.Debugf("runner is handling updates slowly, discarded previous update for %s", c.UUID)
default:
}
tracker.updates <- c
"context"
"flag"
"fmt"
- "log"
"os"
"os/exec"
"os/signal"
"git.curoverse.com/arvados.git/sdk/go/arvados"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "github.com/Sirupsen/logrus"
)
var version = "dev"
func main() {
err := doMain()
if err != nil {
- log.Fatalf("%q", err)
+ logrus.Fatalf("%q", err)
}
}
)
func doMain() error {
+ logger := logrus.StandardLogger()
+ if os.Getenv("DEBUG") != "" {
+ logger.SetLevel(logrus.DebugLevel)
+ }
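+ // Emit JSON log records with nanosecond-precision RFC3339 timestamps.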
+ logger.Formatter = &logrus.JSONFormatter{
+ TimestampFormat: "2006-01-02T15:04:05.000000000Z07:00",
+ }
+
flags := flag.NewFlagSet("crunch-dispatch-local", flag.ExitOnError)
pollInterval := flags.Int(
return nil
}
- log.Printf("crunch-dispatch-local %s started", version)
+ logger.Printf("crunch-dispatch-local %s started", version)
runningCmds = make(map[string]*exec.Cmd)
arv, err := arvadosclient.MakeArvadosClient()
if err != nil {
- log.Printf("Error making Arvados client: %v", err)
+ logger.Errorf("error making Arvados client: %v", err)
return err
}
arv.Retries = 25
dispatcher := dispatch.Dispatcher{
+ Logger: logger,
Arv: arv,
RunContainer: run,
PollPeriod: time.Duration(*pollInterval) * time.Second,
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
sig := <-c
- log.Printf("Received %s, shutting down", sig)
+ logger.Printf("Received %s, shutting down", sig)
signal.Stop(c)
cancel()
cmd.Stderr = os.Stderr
cmd.Stdout = os.Stderr
- log.Printf("Starting container %v", uuid)
+ dispatcher.Logger.Printf("starting container %v", uuid)
// Add this crunch job to the list of runningCmds only if we
// succeed in starting crunch-run.
runningCmdsMutex.Lock()
if err := startCmd(container, cmd); err != nil {
runningCmdsMutex.Unlock()
- log.Printf("Error starting %v for %v: %q", *crunchRunCommand, uuid, err)
+ dispatcher.Logger.Warnf("error starting %q for %s: %s", *crunchRunCommand, uuid, err)
dispatcher.UpdateState(uuid, dispatch.Cancelled)
} else {
runningCmds[uuid] = cmd
go func() {
if _, err := cmd.Process.Wait(); err != nil {
- log.Printf("Error while waiting for crunch job to finish for %v: %q", uuid, err)
+ dispatcher.Logger.Warnf("error while waiting for crunch job to finish for %v: %q", uuid, err)
}
- log.Printf("sending done")
+ dispatcher.Logger.Debugf("sending done")
done <- struct{}{}
}()
case c := <-status:
// Interrupt the child process if priority changes to 0
if (c.State == dispatch.Locked || c.State == dispatch.Running) && c.Priority == 0 {
- log.Printf("Sending SIGINT to pid %d to cancel container %v", cmd.Process.Pid, uuid)
+ dispatcher.Logger.Printf("sending SIGINT to pid %d to cancel container %v", cmd.Process.Pid, uuid)
cmd.Process.Signal(os.Interrupt)
}
}
}
close(done)
- log.Printf("Finished container run for %v", uuid)
+ dispatcher.Logger.Printf("finished container run for %v", uuid)
// Remove the crunch job from runningCmds
runningCmdsMutex.Lock()
// If the container is not finalized, then change it to "Cancelled".
err := dispatcher.Arv.Get("containers", uuid, nil, &container)
if err != nil {
- log.Printf("Error getting final container state: %v", err)
+ dispatcher.Logger.Warnf("error getting final container state: %v", err)
}
if container.State == dispatch.Locked || container.State == dispatch.Running {
- log.Printf("After %s process termination, container state for %v is %q. Updating it to %q",
- *crunchRunCommand, container.State, uuid, dispatch.Cancelled)
+ dispatcher.Logger.Warnf("after %q process termination, container state for %v is %q; updating it to %q",
+ *crunchRunCommand, uuid, container.State, dispatch.Cancelled)
dispatcher.UpdateState(uuid, dispatch.Cancelled)
}
for range status {
}
- log.Printf("Finalized container %v", uuid)
+ dispatcher.Logger.Printf("finalized container %v", uuid)
}
"bytes"
"context"
"io"
- "log"
"net/http"
"net/http/httptest"
"os"
"os/exec"
- "strings"
+ "regexp"
"testing"
"time"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/arvadostest"
"git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "github.com/Sirupsen/logrus"
. "gopkg.in/check.v1"
)
initialArgs = os.Args
arvadostest.StartAPI()
runningCmds = make(map[string]*exec.Cmd)
+ logrus.SetFormatter(&logrus.TextFormatter{DisableColors: true})
}
func (s *TestSuite) TearDownSuite(c *C) {
arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx2", "state":"Running", "priority":1, "locked_by_uuid": "` + arvadostest.Dispatch1AuthUUID + `"}`)}
testWithServerStub(c, apiStubResponses, "echo",
- `After echo process termination, container state for Running is "zzzzz-dz642-xxxxxxxxxxxxxx2". Updating it to "Cancelled"`)
+ `after \\"echo\\" process termination, container state for zzzzz-dz642-xxxxxxxxxxxxxx2 is \\"Running\\"; updating it to \\"Cancelled\\"`)
}
func (s *MockArvadosServerSuite) Test_ErrorRunningContainer(c *C) {
apiStubResponses["/arvados/v1/containers/zzzzz-dz642-xxxxxxxxxxxxxx3/lock"] =
arvadostest.StubResponse{200, string(`{"uuid":"zzzzz-dz642-xxxxxxxxxxxxxx3", "state":"Locked", "priority":1}`)}
- testWithServerStub(c, apiStubResponses, "nosuchcommand", "Error starting nosuchcommand for zzzzz-dz642-xxxxxxxxxxxxxx3")
+ testWithServerStub(c, apiStubResponses, "nosuchcommand", `error starting \\"nosuchcommand\\" for zzzzz-dz642-xxxxxxxxxxxxxx3`)
}
func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
}
buf := bytes.NewBuffer(nil)
- log.SetOutput(io.MultiWriter(buf, os.Stderr))
- defer log.SetOutput(os.Stderr)
+ logrus.SetOutput(io.MultiWriter(buf, os.Stderr))
+ defer logrus.SetOutput(os.Stderr)
*crunchRunCommand = crunchCmd
ctx, cancel := context.WithCancel(context.Background())
dispatcher := dispatch.Dispatcher{
Arv: arv,
- PollPeriod: time.Duration(1) * time.Second,
+ PollPeriod: time.Second / 20,
RunContainer: func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
run(d, c, s)
cancel()
return cmd.Start()
}
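+ // (?ms): "." matches newlines, so the pattern can match anywhere in
+ // the accumulated multi-line log output.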
+ re := regexp.MustCompile(`(?ms).*` + expected + `.*`)
go func() {
- for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
+ for i := 0; i < 80 && !re.MatchString(buf.String()); i++ {
time.Sleep(100 * time.Millisecond)
}
cancel()
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/config"
"git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "github.com/Sirupsen/logrus"
"github.com/coreos/go-systemd/daemon"
)
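+// logger is the logging interface used in this package: dispatch.Logger
+// plus Fatalf. logrus.FieldLogger satisfies it.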
+type logger interface {
+ dispatch.Logger
+ Fatalf(string, ...interface{})
+}
+
const initialNiceValue int64 = 10000
var (
type Dispatcher struct {
*dispatch.Dispatcher
+ logger logrus.FieldLogger
cluster *arvados.Cluster
sqCheck *SqueueChecker
slurm Slurm
}
func main() {
- disp := &Dispatcher{}
+ logger := logrus.StandardLogger()
+ if os.Getenv("DEBUG") != "" {
+ logger.SetLevel(logrus.DebugLevel)
+ }
+ logger.Formatter = &logrus.JSONFormatter{
+ TimestampFormat: "2006-01-02T15:04:05.000000000Z07:00",
+ }
+ disp := &Dispatcher{logger: logger}
err := disp.Run(os.Args[0], os.Args[1:])
if err != nil {
- log.Fatal(err)
+ logrus.Fatalf("%s", err)
}
}
return nil
}
- log.Printf("crunch-dispatch-slurm %s started", version)
+ disp.logger.Printf("crunch-dispatch-slurm %s started", version)
err := disp.readConfig(*configPath)
if err != nil {
os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(disp.Client.KeepServiceURIs, " "))
os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
} else {
- log.Printf("warning: Client credentials missing from config, so falling back on environment variables (deprecated).")
+ disp.logger.Warnf("Client credentials missing from config, so falling back on environment variables (deprecated).")
}
if *dumpConfig {
siteConfig, err := arvados.GetConfig(arvados.DefaultConfigFile)
if os.IsNotExist(err) {
- log.Printf("warning: no cluster config (%s), proceeding with no node types defined", err)
+ disp.logger.Warnf("no cluster config (%s), proceeding with no node types defined", err)
} else if err != nil {
return fmt.Errorf("error loading config: %s", err)
} else if disp.cluster, err = siteConfig.GetCluster(""); err != nil {
// setup() initializes private fields after configure().
func (disp *Dispatcher) setup() {
+ if disp.logger == nil {
+ disp.logger = logrus.StandardLogger()
+ }
arv, err := arvadosclient.MakeArvadosClient()
if err != nil {
- log.Fatalf("Error making Arvados client: %v", err)
+ disp.logger.Fatalf("Error making Arvados client: %v", err)
}
arv.Retries = 25
disp.slurm = &slurmCLI{}
disp.sqCheck = &SqueueChecker{
+ Logger: disp.logger,
Period: time.Duration(disp.PollPeriod),
PrioritySpread: disp.PrioritySpread,
Slurm: disp.slurm,
}
disp.Dispatcher = &dispatch.Dispatcher{
Arv: arv,
+ Logger: disp.logger,
BatchSize: disp.BatchSize,
RunContainer: disp.runContainer,
PollPeriod: time.Duration(disp.PollPeriod),
case <-ctx.Done():
// Disappeared from squeue
if err := disp.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
- log.Printf("Error getting final container state for %s: %s", ctr.UUID, err)
+ log.Printf("error getting final container state for %s: %s", ctr.UUID, err)
}
switch ctr.State {
case dispatch.Running:
"fmt"
"io"
"io/ioutil"
- "log"
"net/http"
"net/http/httptest"
"os"
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/arvadostest"
"git.curoverse.com/arvados.git/sdk/go/dispatch"
+ "github.com/Sirupsen/logrus"
. "gopkg.in/check.v1"
)
}
s.disp.slurm = &s.slurm
- s.disp.sqCheck = &SqueueChecker{Period: 500 * time.Millisecond, Slurm: s.disp.slurm}
+ s.disp.sqCheck = &SqueueChecker{
+ Logger: logrus.StandardLogger(),
+ Period: 500 * time.Millisecond,
+ Slurm: s.disp.slurm,
+ }
err = s.disp.Dispatcher.Run(ctx)
<-doneRun
}
buf := bytes.NewBuffer(nil)
- log.SetOutput(io.MultiWriter(buf, os.Stderr))
- defer log.SetOutput(os.Stderr)
+ logrus.SetOutput(io.MultiWriter(buf, os.Stderr))
+ defer logrus.SetOutput(os.Stderr)
s.disp.CrunchRunCommand = []string{crunchCmd}
import (
"bytes"
"fmt"
- "log"
"sort"
"strings"
"sync"
// Squeue implements asynchronous polling monitor of the SLURM queue using the
// command 'squeue'.
type SqueueChecker struct {
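+ // Logger reports problems polling or updating the SLURM queue.
+ // It must be non-nil.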
+ Logger logger
Period time.Duration
PrioritySpread int64
Slurm Slurm
}
err := sqc.Slurm.Renice(job.uuid, niceNew)
if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
- log.Printf("container %q clamping nice values at %d, priority order will not be correct -- see https://dev.arvados.org/projects/arvados/wiki/SLURM_integration#Limited-nice-values-SLURM-15", job.uuid, slurm15NiceLimit)
+ sqc.Logger.Warnf("container %q clamping nice values at %d, priority order will not be correct -- see https://dev.arvados.org/projects/arvados/wiki/SLURM_integration#Limited-nice-values-SLURM-15", job.uuid, slurm15NiceLimit)
job.hitNiceLimit = true
}
}
stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{}
cmd.Stdout, cmd.Stderr = stdout, stderr
if err := cmd.Run(); err != nil {
- log.Printf("Error running %q %q: %s %q", cmd.Path, cmd.Args, err, stderr.String())
+ sqc.Logger.Warnf("Error running %q %q: %s %q", cmd.Path, cmd.Args, err, stderr.String())
return
}
var uuid, state, reason string
var n, p int64
if _, err := fmt.Sscan(line, &uuid, &n, &p, &state, &reason); err != nil {
- log.Printf("warning: ignoring unparsed line in squeue output: %q", line)
+ sqc.Logger.Warnf("ignoring unparsed line in squeue output: %q", line)
continue
}
// "launch failed requeued held" seems to be
// another manifestation of this problem,
// resolved the same way.
- log.Printf("releasing held job %q (priority=%d, state=%q, reason=%q)", uuid, p, state, reason)
+ sqc.Logger.Printf("releasing held job %q (priority=%d, state=%q, reason=%q)", uuid, p, state, reason)
sqc.Slurm.Release(uuid)
} else if state != "RUNNING" && p <= 2*slurm15NiceLimit && replacing.wantPriority > 0 {
- log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)
+ sqc.Logger.Warnf("job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)
}
}
sqc.lock.Lock()
import (
"time"
+ "github.com/Sirupsen/logrus"
. "gopkg.in/check.v1"
)
queue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING BadConstraints\n",
}
sqc := &SqueueChecker{
+ Logger: logrus.StandardLogger(),
Slurm: slurm,
Period: time.Hour,
}
queue: test.squeue,
}
sqc := &SqueueChecker{
+ Logger: logrus.StandardLogger(),
Slurm: slurm,
PrioritySpread: test.spread,
Period: time.Hour,
rejectNice10K: true,
}
sqc := &SqueueChecker{
+ Logger: logrus.StandardLogger(),
Slurm: slurm,
PrioritySpread: 1,
Period: time.Hour,
slurm := &slurmFake{}
sqc := &SqueueChecker{
+ Logger: logrus.StandardLogger(),
Slurm: slurm,
Period: time.Hour,
}