diff --git a/README.md b/README.md index 954491f..189585b 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,32 @@ Automatically set up Grafana organizations based on information from the [APPUiO Control API](https://kb.vshn.ch/appuio-cloud/references/architecture/control-api.html). +## Design Considerations + +There are a bunch of issues this operator needs to overcome. These issues have a large impact on how the operator is implemented. + +### Systematic Issues + +* Changes can happen both in the APPUiO Control API and on Grafana. Hence it is not sufficient to observe the APPUiO Control API and only sync when its data changes; instead in order to fix potential misconfiguration in Grafana this operator runs near continuously. + +### Issues with the APPUiO Control API + +* The Control API has no notion of "admin" users, i.e. users which should have access to everything. The configuration option `CONTROL_API_ADMIN_ORG` allows to specify a Control API organization, and all users who have access to that will be treated as admins. +* The `Organization` and `User` types use different GroupVersions. It seems that this cannot be implemented using a single k8s.io client; instead we need two client instances for the Control API. +* The `OrganizationMember` field `spec.userRefs` contains a lot of invalid values, including values with incorrect syntax. These must be filtered out. + +### Issues with Grafana + +* When a new user is created in Grafana (either by logging in via Keycloak or explicitly created via API), Grafana's `auto_assign_org` "feature" automatically gives the user permission to some organization (whichever is configured). This is almost never what we want. To work around this: + * It would be possible to disable `auto_assign_org`, but then Grafana would create a new organization for every new user, which would be even worse. + * We create a Grafana user for every known user, even if the user doesn't have permission to view anything. Thus: When the user logs in for the first time it is already known to Grafana, preventing the `auto_assign_org` feature from kicking in. + * When we create a Grafana user via API we can specify which organization it should be assigned to. We set this to an organization that the user is allowed to access whenever possible. + * We manage the users of org ID 1, which is the one where users might get assigned to by accident, thereby fixing any wrong permissions. +* Because of the `auto_assign_org` "feature" we have to create the organizations before we can create users +* Because the `grafana-api-golang-client` implementation is incomplete and can't easily be extended due to limitations of the Go language we can't sync permissions when we sync users. Thus syncing permissions happens in an extra step. +* The Grafana API often ignores the OrgID JSON field. The only workaround for this is to set the HTTP header `x-grafana-org-id`. We try to avoid that. +* The `grafana-api-golang-client` implementation doesn't allow setting the `x-grafana-org-id` header without creating a new client instance, which wreaks havoc with the connection pooling and reuse. Thus we try to avoid that. + ## Development Environment Setup In order to develop the operator you need: diff --git a/gen-dev-env.sh b/gen-dev-env.sh index 335376d..7e8e95b 100755 --- a/gen-dev-env.sh +++ b/gen-dev-env.sh @@ -16,4 +16,5 @@ echo "export CONTROL_API_URL=https://api.appuio.cloud" >> env echo -n "export CONTROL_API_TOKEN=" >> env kubectl --kubeconfig control.kubeconfig get secret grafana-organizations-operator -ojsonpath='{.data.token}' | base64 -d >> env echo "" >> env +echo "export CONTROL_API_ADMIN_ORG=vshn" >> env echo "done" diff --git a/main.go b/main.go index 0a561f7..495fe6c 100644 --- a/main.go +++ b/main.go @@ -20,29 +20,44 @@ import ( ) var ( - ControlApiToken string - ControlApiUrl string - GrafanaUrl string - GrafanaUsername string - GrafanaPassword string + ControlApiToken string + ControlApiUrl string + ControlApiAdminOrg string + GrafanaUrl string + GrafanaUsername string + GrafanaPassword string + AdminExtraOrgs = [1]int{1} ) func main() { ControlApiUrl = os.Getenv("CONTROL_API_URL") + ControlApiAdminOrg = os.Getenv("CONTROL_API_ADMIN_ORG") ControlApiToken = os.Getenv("CONTROL_API_TOKEN") + ControlApiTokenHidden := "" + if ControlApiToken != "" { + ControlApiTokenHidden = "***hidden***" + } + GrafanaUrl = os.Getenv("GRAFANA_URL") GrafanaUsername = os.Getenv("GRAFANA_USERNAME") if GrafanaUsername == "" { GrafanaUsername = os.Getenv("admin-user") // env variable name used by Grafana Helm chart. And yes using '-' is stupid because of compatibility issues with some shells. } GrafanaPassword = os.Getenv("GRAFANA_PASSWORD") + GrafanaPasswordHidden := "" if GrafanaPassword == "" { GrafanaPassword = os.Getenv("admin-password") // env variable name used by Grafana Helm chart. And yes using '-' is stupid because of compatibility issues with some shells. } + if GrafanaPassword != "" { + GrafanaPasswordHidden = "***hidden***" + } klog.Infof("CONTROL_API_URL: %s\n", ControlApiUrl) + klog.Infof("CONTROL_API_ADMIN_ORG: %s\n", ControlApiAdminOrg) + klog.Infof("CONTROL_API_TOKEN: %s\n", ControlApiTokenHidden) klog.Infof("GRAFANA_URL: %s\n", GrafanaUrl) klog.Infof("GRAFANA_USERNAME: %s\n", GrafanaUsername) + klog.Infof("GRAFANA_PASSWORD: %s\n", GrafanaPasswordHidden) // Because of the strange design of the k8s client we actually need two client objects, which both internally use the same httpClient. // To make this work we also need three (!) config objects, a common one for the httpClient and one for each k8s client. @@ -100,7 +115,7 @@ func main() { json.Unmarshal(db, &dashboard) klog.Info("Starting initial sync...") - err = controller.Reconcile(ctx, organizationAppuioIoClient, appuioIoClient, grafanaConfig, GrafanaUrl, dashboard) + err = controller.Reconcile(ctx, organizationAppuioIoClient, appuioIoClient, ControlApiAdminOrg, grafanaConfig, GrafanaUrl, dashboard) if err != nil { klog.Errorf("Could not do initial reconciliation: %v\n", err) os.Exit(1) @@ -112,7 +127,7 @@ func main() { case <-ctx.Done(): os.Exit(0) } - err = controller.Reconcile(ctx, organizationAppuioIoClient, appuioIoClient, grafanaConfig, GrafanaUrl, dashboard) + err = controller.Reconcile(ctx, organizationAppuioIoClient, appuioIoClient, ControlApiAdminOrg, grafanaConfig, GrafanaUrl, dashboard) if err != nil { klog.Errorf("Could not reconcile (will retry): %v\n", err) } diff --git a/pkg/reconcile.go b/pkg/reconcile.go index a949fc2..c652d2e 100644 --- a/pkg/reconcile.go +++ b/pkg/reconcile.go @@ -12,13 +12,13 @@ var ( interruptedError = errors.New("interrupted") ) -func Reconcile(ctx context.Context, organizationAppuioIoClient *rest.RESTClient, appuioIoClient *rest.RESTClient, grafanaConfig grafana.Config, grafanaUrl string, dashboard map[string]interface{}) error { +func Reconcile(ctx context.Context, organizationAppuioIoClient *rest.RESTClient, appuioIoClient *rest.RESTClient, adminOrg string, grafanaConfig grafana.Config, grafanaUrl string, dashboard map[string]interface{}) error { grafanaOrgsMap, err := reconcileAllOrgs(ctx, organizationAppuioIoClient, grafanaConfig, grafanaUrl, dashboard) if err != nil { return err } - uidToGrafanaOrgs, err := getControlApiUserOrganizationsMap(ctx, appuioIoClient, grafanaOrgsMap) + uidToGrafanaOrgs, adminUids, err := getControlApiUserOrganizationsMap(ctx, appuioIoClient, grafanaOrgsMap, adminOrg) if err != nil { return err } @@ -29,7 +29,7 @@ func Reconcile(ctx context.Context, organizationAppuioIoClient *rest.RESTClient, return err } - err = reconcilePermissions(ctx, grafanaOrgsMap, uidToGrafanaOrgs, grafanaConfig, grafanaUrl) + err = reconcilePermissions(ctx, grafanaOrgsMap, uidToGrafanaOrgs, adminUids, grafanaConfig, grafanaUrl) if err != nil { return err } @@ -38,11 +38,12 @@ func Reconcile(ctx context.Context, organizationAppuioIoClient *rest.RESTClient, } // Generate map with all organizations per user. Key is the user ID, value is an array of pointers to Grafana organizations -func getControlApiUserOrganizationsMap(ctx context.Context, appuioIoClient *rest.RESTClient, grafanaOrgsMap map[string]*grafana.Org) (map[string][]*grafana.Org, error) { +func getControlApiUserOrganizationsMap(ctx context.Context, appuioIoClient *rest.RESTClient, grafanaOrgsMap map[string]*grafana.Org, adminOrg string) (map[string][]*grafana.Org, []string, error) { appuioControlApiOrganizationMembers := controlapi.OrganizationMembersList{} + adminUids := make([]string, 0) err := appuioIoClient.Get().Resource("OrganizationMembers").Do(ctx).Into(&appuioControlApiOrganizationMembers) if err != nil { - return nil, err + return nil, nil, err } userOrganizations := make(map[string][]*grafana.Org) @@ -56,7 +57,10 @@ func getControlApiUserOrganizationsMap(ctx context.Context, appuioIoClient *rest if ok { userOrganizations[userRef.Name] = append(userOrganizations[userRef.Name], grafanaOrg) } + if memberlist.Namespace == adminOrg { + adminUids = append(adminUids, userRef.Name) + } } } - return userOrganizations, nil + return userOrganizations, adminUids, nil } diff --git a/pkg/reconcilePermissions.go b/pkg/reconcilePermissions.go index 17636ed..da57806 100644 --- a/pkg/reconcilePermissions.go +++ b/pkg/reconcilePermissions.go @@ -5,9 +5,10 @@ import ( grafana "github.com/grafana/grafana-api-golang-client" "github.com/hashicorp/go-cleanhttp" "k8s.io/klog/v2" + "k8s.io/utils/strings/slices" ) -func reconcilePermissions(ctx context.Context, grafanaOrgsMap map[string]*grafana.Org, uidToGrafanaOrgs map[string][]*grafana.Org, grafanaConfig grafana.Config, grafanaUrl string) error { +func reconcilePermissions(ctx context.Context, grafanaOrgsMap map[string]*grafana.Org, uidToGrafanaOrgs map[string][]*grafana.Org, adminUids []string, grafanaConfig grafana.Config, grafanaUrl string) error { // We have to iterate over grafanaOrgs here, hence we need to invert our uid <-> grafanaOrg association table grafanaOrgToUids := make(map[*grafana.Org][]string) @@ -36,7 +37,9 @@ func reconcilePermissions(ctx context.Context, grafanaOrgsMap map[string]*grafan } initialOrgUsersMap := make(map[string]grafana.OrgUser) for _, initialOrgUser := range initialOrgUsers { - initialOrgUsersMap[initialOrgUser.Login] = initialOrgUser + if initialOrgUser.Login != "admin" { // ignore admin + initialOrgUsersMap[initialOrgUser.Login] = initialOrgUser + } } desiredOrgUsers, ok := grafanaOrgToUids[org] @@ -46,10 +49,22 @@ func reconcilePermissions(ctx context.Context, grafanaOrgsMap map[string]*grafan // Check and if necessary add desired users for _, desiredOrgUser := range desiredOrgUsers { - if _, ok := initialOrgUsersMap[desiredOrgUser]; !ok { - klog.Infof("User '%s' needs to have access to org '%s' (%d), adding", desiredOrgUser, org.Name, org.ID) + if initialOrgUser, ok := initialOrgUsersMap[desiredOrgUser]; ok { + // Permission exists + if initialOrgUser.Role != "Viewer" && initialOrgUser.Role != "Editor" && !slices.Contains(adminUids, initialOrgUser.Login) { + // A normal user can only have roles "Viewer" and "Editor". Other roles would give the user too many permissions, e.g. to change the data source which would be a security issue. + err := grafanaClient.UpdateOrgUser(org.ID, initialOrgUser.UserID, "Editor") + if err != nil { + // This can happen due to race conditions, hence just a warning + klog.Warning(err) + } + } + } else { + // Permission missing + klog.Infof("User '%s' should have access to org '%s' (%d), adding", desiredOrgUser, org.Name, org.ID) err := grafanaClient.AddOrgUser(org.ID, desiredOrgUser, "Editor") if err != nil { + // This can happen due to race conditions, hence just a warning klog.Warning(err) } } @@ -62,11 +77,31 @@ func reconcilePermissions(ctx context.Context, grafanaOrgsMap map[string]*grafan } } + // Check and if necessary add desired admins + for _, desiredOrgAdmin := range adminUids { + if _, ok := initialOrgUsersMap[desiredOrgAdmin]; !ok { + klog.Infof("User '%s' should be admin of org '%s' (%d), adding", desiredOrgAdmin, org.Name, org.ID) + err := grafanaClient.AddOrgUser(org.ID, desiredOrgAdmin, "Admin") + if err != nil { + // This can happen due to race conditions, hence just a warning + klog.Warning(err) + } + } + delete(initialOrgUsersMap, desiredOrgAdmin) + + select { + case <-ctx.Done(): + return interruptedError + default: + } + } + // Remove all the users that are left in initialOrgUsersMap for _, undesiredOrgUser := range initialOrgUsersMap { klog.Infof("User '%s' (%d) must not have access to org '%s' (%d), removing", undesiredOrgUser, undesiredOrgUser.UserID, org.Name, org.ID) err := grafanaClient.RemoveOrgUser(org.ID, undesiredOrgUser.UserID) if err != nil { + // This can happen due to race conditions, hence just a warning klog.Warning(err) } diff --git a/pkg/reconcileUser.go b/pkg/reconcileUser.go index be7082b..bbf2496 100644 --- a/pkg/reconcileUser.go +++ b/pkg/reconcileUser.go @@ -2,9 +2,13 @@ package controller import ( "crypto/rand" + "fmt" controlapi "github.com/appuio/control-api/apis/v1" grafana "github.com/grafana/grafana-api-golang-client" + "io" + "k8s.io/apimachinery/pkg/util/json" "math/big" + "net/http" ) func generatePassword() (string, error) { @@ -32,21 +36,52 @@ func createUser(client *grafana.Client, user controlapi.User, orgs []*grafana.Or Login: user.Name, Name: user.Status.DisplayName, Password: password, - // This is the initial organization of the user. We have to set this, otherwise the user will be assigned to org ID 1, which is not what we want. We'll fix the permissions later. - // Interestingly here the OrgID field works and the HTTP header 'x-grafana-org-id' isn't required, in contrast to the addOrgUser API call. - OrgID: orgs[0].ID, + } + if len(orgs) > 0 { + // By default Grafana gives the user permission to org ID 1, which is likely not correct. + // This isn't a huge problem because we'll strip the permission in the next step, but still we'd like to avoid + // this race condition and thus we tell Grafana to use another organization, if available. + grafanaUser.OrgID = orgs[0].ID } grafanaUser.ID, err = client.CreateUser(grafanaUser) if err != nil { return nil, err } - // we immediately remove the user from the org (!) because who knows what permissions the user got on that org (can't be controlled when creating the user) - // yes this is stupid but that's how Grafana works - err = client.RemoveOrgUser(orgs[0].ID, grafanaUser.ID) + h := http.Client{} + url := fmt.Sprintf("https://operator-dev-grafana.apps.cloudscale-lpg-2.appuio.cloud/api/users/%d/orgs", grafanaUser.ID) + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, err + } + req.SetBasicAuth("admin", "...") + r, err := h.Do(req) + defer r.Body.Close() + if err != nil { + return nil, err + } + body, err := io.ReadAll(r.Body) if err != nil { return nil, err } + type UserOrg struct { + OrgID int64 `json:"orgId"` + Name string `json:"name"` + Role string `json:"role"` + } + userOrgs := make([]UserOrg, 0) + + json.Unmarshal(body, &userOrgs) + + for _, userOrg := range userOrgs { + // we immediately remove the user from the automatically assigned org because who knows what permissions the user got on that org (can't be controlled when creating the user) + // yes this is stupid but that's how Grafana works + err = client.RemoveOrgUser(userOrg.OrgID, grafanaUser.ID) + if err != nil { + return nil, err + } + } + return &grafanaUser, nil } diff --git a/pkg/reconcileUsers.go b/pkg/reconcileUsers.go index 44b0196..6af4416 100644 --- a/pkg/reconcileUsers.go +++ b/pkg/reconcileUsers.go @@ -29,9 +29,12 @@ func reconcileUsers(ctx context.Context, users map[string]controlapi.User, uidTo for _, user := range users { orgs, ok := uidToGrafanaOrgs[user.Name] - if !ok || len(orgs) == 0 { - // ignore users that don't have access to any org - continue + if !ok { + // Even if the user doesn't have access to any org we still need to create it. + // This is to work around Grafana's auto_assign_org "feature": If the user were to log in via + // Keycloak Grafana would create a user and give it permission to see org ID 1. + // We can prevent this by pre-emptively creating the user and stripping it of all permissions. + orgs = make([]*grafana.Org, 0) } var grafanaUser *grafana.User @@ -53,7 +56,6 @@ func reconcileUsers(ctx context.Context, users map[string]controlapi.User, uidTo klog.Infof("User '%s' is missing, adding", user.Name) grafanaUser, err = createUser(client, user, orgs) if err != nil { - //return err // for now just continue in case errors happen klog.Error(err) continue