diff --git a/api/custom/exports.go b/api/custom/exports.go new file mode 100644 index 0000000000000000000000000000000000000000..3895c26826dd77e05e09632a5094a39cbfe533f8 --- /dev/null +++ b/api/custom/exports.go @@ -0,0 +1,258 @@ +package custom + +import ( + "bytes" + "net/http" + "strconv" + + "github.com/dustin/go-humanize" + "github.com/gorilla/mux" + "github.com/sirupsen/logrus" + "github.com/turt2live/matrix-media-repo/api" + "github.com/turt2live/matrix-media-repo/api/r0" + "github.com/turt2live/matrix-media-repo/common/config" + "github.com/turt2live/matrix-media-repo/controllers/data_controller" + "github.com/turt2live/matrix-media-repo/storage" + "github.com/turt2live/matrix-media-repo/storage/datastore" + "github.com/turt2live/matrix-media-repo/templating" + "github.com/turt2live/matrix-media-repo/util" +) + +type ExportStarted struct { + ExportID string `json:"export_id"` + TaskID int `json:"task_id"` +} + +type ExportPartMetadata struct { + Index int `json:"index"` + SizeBytes int64 `json:"size"` + FileName string `json:"name"` +} + +type ExportMetadata struct { + Entity string `json:"entity"` + Parts []*ExportPartMetadata `json:"parts"` +} + +func ExportUserData(r *http.Request, log *logrus.Entry, user api.UserInfo) interface{} { + if !config.Get().Archiving.Enabled { + return api.BadRequest("archiving is not enabled") + } + + isAdmin := util.IsGlobalAdmin(user.UserId) || user.IsShared + if !config.Get().Archiving.SelfService && !isAdmin { + return api.AuthFailed() + } + + includeData := r.URL.Query().Get("include_data") != "false" + s3urls := r.URL.Query().Get("s3_urls") != "false" + + params := mux.Vars(r) + + userId := params["userId"] + + if !isAdmin && user.UserId != userId { + return api.BadRequest("cannot export data for another user") + } + + log = log.WithFields(logrus.Fields{ + "exportUserId": userId, + "includeData": includeData, + "s3urls": s3urls, + }) + task, exportId, err := data_controller.StartUserExport(userId, s3urls, includeData, log) + if err != nil { + log.Error(err) + return api.InternalServerError("fatal error starting export") + } + + return &api.DoNotCacheResponse{Payload: &ExportStarted{ + TaskID: task.ID, + ExportID: exportId, + }} +} + +func ViewExport(r *http.Request, log *logrus.Entry, user api.UserInfo) interface{} { + if !config.Get().Archiving.Enabled { + return api.BadRequest("archiving is not enabled") + } + + params := mux.Vars(r) + + exportId := params["exportId"] + log = log.WithFields(logrus.Fields{ + "exportId": exportId, + }) + + exportDb := storage.GetDatabase().GetExportStore(r.Context(), log) + + exportInfo, err := exportDb.GetExportMetadata(exportId) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to get metadata") + } + + parts, err := exportDb.GetExportParts(exportId) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to get export parts") + } + + template, err := templating.GetTemplate("view_export") + if err != nil { + log.Error(err) + return api.InternalServerError("failed to get template") + } + + model := &templating.ViewExportModel{ + ExportID: exportInfo.ExportID, + Entity: exportInfo.Entity, + ExportParts: make([]*templating.ViewExportPartModel, 0), + } + for _, p := range parts { + model.ExportParts = append(model.ExportParts, &templating.ViewExportPartModel{ + ExportID: exportInfo.ExportID, + Index: p.Index, + FileName: p.FileName, + SizeBytes: p.SizeBytes, + SizeBytesHuman: humanize.Bytes(uint64(p.SizeBytes)), + }) + } + + html := bytes.Buffer{} + err = 
template.Execute(&html, model) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to render template") + } + + return &api.HtmlResponse{HTML: string(html.Bytes())} +} + +func GetExportMetadata(r *http.Request, log *logrus.Entry, user api.UserInfo) interface{} { + if !config.Get().Archiving.Enabled { + return api.BadRequest("archiving is not enabled") + } + + params := mux.Vars(r) + + exportId := params["exportId"] + log = log.WithFields(logrus.Fields{ + "exportId": exportId, + }) + + exportDb := storage.GetDatabase().GetExportStore(r.Context(), log) + + exportInfo, err := exportDb.GetExportMetadata(exportId) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to get metadata") + } + + parts, err := exportDb.GetExportParts(exportId) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to get export parts") + } + + metadata := &ExportMetadata{ + Entity: exportInfo.Entity, + Parts: make([]*ExportPartMetadata, 0), + } + for _, p := range parts { + metadata.Parts = append(metadata.Parts, &ExportPartMetadata{ + Index: p.Index, + SizeBytes: p.SizeBytes, + FileName: p.FileName, + }) + } + + return &api.DoNotCacheResponse{Payload: metadata} +} + +func DownloadExportPart(r *http.Request, log *logrus.Entry, user api.UserInfo) interface{} { + if !config.Get().Archiving.Enabled { + return api.BadRequest("archiving is not enabled") + } + + params := mux.Vars(r) + + exportId := params["exportId"] + partId, err := strconv.ParseInt(params["partId"], 10, 64) + if err != nil { + log.Error(err) + return api.BadRequest("invalid part index") + } + + log = log.WithFields(logrus.Fields{ + "exportId": exportId, + "partId": partId, + }) + + db := storage.GetDatabase().GetExportStore(r.Context(), log) + part, err := db.GetExportPart(exportId, int(partId)) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to get part") + } + + s, err := datastore.DownloadStream(r.Context(), log, part.DatastoreID, part.Location) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to start download") + } + + return &r0.DownloadMediaResponse{ + ContentType: "application/gzip", + SizeBytes: part.SizeBytes, + Data: s, + Filename: part.FileName, + } +} + +func DeleteExport(r *http.Request, log *logrus.Entry, user api.UserInfo) interface{} { + if !config.Get().Archiving.Enabled { + return api.BadRequest("archiving is not enabled") + } + + params := mux.Vars(r) + + exportId := params["exportId"] + + log = log.WithFields(logrus.Fields{ + "exportId": exportId, + }) + + db := storage.GetDatabase().GetExportStore(r.Context(), log) + + log.Info("Getting information on which parts to delete") + parts, err := db.GetExportParts(exportId) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to delete export") + } + + for _, part := range parts { + log.Info("Locating datastore: " + part.DatastoreID) + ds, err := datastore.LocateDatastore(r.Context(), log, part.DatastoreID) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to delete export") + } + + log.Info("Deleting object: " + part.Location) + err = ds.DeleteObject(part.Location) + if err != nil { + log.Warn(err) + } + } + + log.Info("Purging export from database") + err = db.DeleteExportAndParts(exportId) + if err != nil { + log.Error(err) + return api.InternalServerError("failed to delete export") + } + + return api.EmptyResponse{} +} diff --git a/api/responses.go b/api/responses.go index 
5fd04ef0184b6b339b6cc3860dd0df65d4e7161f..f439191eef6aa94687a83f46f20ee56f58926581 100644 --- a/api/responses.go +++ b/api/responses.go @@ -8,6 +8,10 @@ type DoNotCacheResponse struct { Payload interface{} } +type HtmlResponse struct { + HTML string +} + type ErrorResponse struct { Code string `json:"errcode"` Message string `json:"error"` diff --git a/api/webserver/route_handler.go b/api/webserver/route_handler.go index 8c5ff41de4b223eb428e1e1bdf0a079d3a047b71..34a3f6fc64f3e8b4da30a2dc2f07cd388197c9d2 100644 --- a/api/webserver/route_handler.go +++ b/api/webserver/route_handler.go @@ -1,6 +1,7 @@ package webserver import ( + "bytes" "encoding/json" "fmt" "io" @@ -166,6 +167,17 @@ func (h handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "image/png") io.Copy(w, result.Avatar) return // Prevent sending conflicting responses + case *api.HtmlResponse: + metrics.HttpResponses.With(prometheus.Labels{ + "host": r.Host, + "action": h.action, + "method": r.Method, + "statusCode": strconv.Itoa(http.StatusOK), + }).Inc() + w.Header().Set("Content-Type", "text/html") + w.Header().Set("Content-Security-Policy", "") // We're serving HTML, so take away the CSP + io.Copy(w, bytes.NewBuffer([]byte(result.HTML))) + return default: break } diff --git a/api/webserver/webserver.go b/api/webserver/webserver.go index bf09ae365bf6936e47a14c36886e7a44d60c93b5..947c613049c7f5b5554675a8e9cf191455316860 100644 --- a/api/webserver/webserver.go +++ b/api/webserver/webserver.go @@ -54,6 +54,11 @@ func Init() { getBackgroundTaskHandler := handler{api.RepoAdminRoute(custom.GetTask), "get_background_task", counter, false} listAllBackgroundTasksHandler := handler{api.RepoAdminRoute(custom.ListAllTasks), "list_all_background_tasks", counter, false} listUnfinishedBackgroundTasksHandler := handler{api.RepoAdminRoute(custom.ListUnfinishedTasks), "list_unfinished_background_tasks", counter, false} + exportUserDataHandler := handler{api.AccessTokenRequiredRoute(custom.ExportUserData), "export_user_data", counter, false} + viewExportHandler := handler{api.AccessTokenOptionalRoute(custom.ViewExport), "view_export", counter, false} + getExportMetadataHandler := handler{api.AccessTokenOptionalRoute(custom.GetExportMetadata), "get_export_metadata", counter, false} + downloadExportPartHandler := handler{api.AccessTokenOptionalRoute(custom.DownloadExportPart), "download_export_part", counter, false} + deleteExportHandler := handler{api.AccessTokenOptionalRoute(custom.DeleteExport), "delete_export", counter, false} routes := make(map[string]route) versions := []string{"r0", "v1", "unstable"} // r0 is typically clients and v1 is typically servers. v1 is deprecated. 
@@ -90,6 +95,11 @@ func Init() { routes["/_matrix/media/"+version+"/admin/tasks/{taskId:[0-9]+}"] = route{"GET", getBackgroundTaskHandler} routes["/_matrix/media/"+version+"/admin/tasks/all"] = route{"GET", listAllBackgroundTasksHandler} routes["/_matrix/media/"+version+"/admin/tasks/unfinished"] = route{"GET", listUnfinishedBackgroundTasksHandler} + routes["/_matrix/media/"+version+"/admin/user/{userId:[^/]+}/export"] = route{"POST", exportUserDataHandler} + routes["/_matrix/media/"+version+"/admin/export/{exportId:[a-zA-Z0-9.:\\-_]+}/view"] = route{"GET", viewExportHandler} + routes["/_matrix/media/"+version+"/admin/export/{exportId:[a-zA-Z0-9.:\\-_]+}/metadata"] = route{"GET", getExportMetadataHandler} + routes["/_matrix/media/"+version+"/admin/export/{exportId:[a-zA-Z0-9.:\\-_]+}/part/{partId:[0-9]+}"] = route{"GET", downloadExportPartHandler} + routes["/_matrix/media/"+version+"/admin/export/{exportId:[a-zA-Z0-9.:\\-_]+}/delete"] = route{"DELETE", deleteExportHandler} // Routes that we should handle but aren't in the media namespace (synapse compat) routes["/_matrix/client/"+version+"/admin/purge_media_cache"] = route{"POST", purgeRemote} diff --git a/cmd/media_repo/main.go b/cmd/media_repo/main.go index 0d674d735944081e6225e9de017634277fa438f3..7c485c57a284d2e9171a4606c68fd4e17af2a5a4 100644 --- a/cmd/media_repo/main.go +++ b/cmd/media_repo/main.go @@ -19,11 +19,13 @@ import ( func main() { configPath := flag.String("config", "media-repo.yaml", "The path to the configuration") - migrationsPath := flag.String("migrations", "./migrations", "The absolute path the migrations folder") + migrationsPath := flag.String("migrations", "./migrations", "The absolute path for the migrations folder") + templatesPath := flag.String("templates", "./templates", "The absolute path for the templates folder") flag.Parse() config.Path = *configPath config.Runtime.MigrationsPath = *migrationsPath + config.Runtime.TemplatesPath = *templatesPath err := logging.Setup(config.Get().General.LogDirectory) if err != nil { diff --git a/common/config/config.go b/common/config/config.go index 8a4320eb8e39f59bdd604b8eeb0c998076573bfc..9bef36f4db66c77652610f412f190547aaef9fdb 100644 --- a/common/config/config.go +++ b/common/config/config.go @@ -11,6 +11,7 @@ import ( type runtimeConfig struct { MigrationsPath string + TemplatesPath string } var Runtime = &runtimeConfig{} @@ -39,6 +40,12 @@ type DatabaseConfig struct { Pool *DbPoolConfig `yaml:"pool"` } +type ArchivingConfig struct { + Enabled bool `yaml:"enabled"` + SelfService bool `yaml:"selfService"` + TargetBytesPerPart int64 `yaml:"targetBytesPerPart"` +} + type UploadsConfig struct { StoragePaths []string `yaml:"storagePaths,flow"` // deprecated MaxSizeBytes int64 `yaml:"maxBytes"` @@ -143,6 +150,7 @@ type MediaRepoConfig struct { Admins []string `yaml:"admins,flow"` Database *DatabaseConfig `yaml:"database"` DataStores []DatastoreConfig `yaml:"datastores"` + Archiving *ArchivingConfig `yaml:"archiving"` Uploads *UploadsConfig `yaml:"uploads"` Downloads *DownloadsConfig `yaml:"downloads"` Thumbnails *ThumbnailsConfig `yaml:"thumbnails"` @@ -234,6 +242,11 @@ func NewDefaultConfig() *MediaRepoConfig { Homeservers: []*HomeserverConfig{}, Admins: []string{}, DataStores: []DatastoreConfig{}, + Archiving: &ArchivingConfig{ + Enabled: true, + SelfService: false, + TargetBytesPerPart: 209715200, // 200mb + }, Uploads: &UploadsConfig{ MaxSizeBytes: 104857600, // 100mb MinSizeBytes: 100, diff --git a/common/media_kinds.go b/common/media_kinds.go index 
0f1b7661bfecfd30ffc28eaea5a953511373d01c..4ea68567cc3b44407bef1d0f16b80903418acb9c 100644 --- a/common/media_kinds.go +++ b/common/media_kinds.go @@ -3,5 +3,6 @@ package common const KindLocalMedia = "local_media" const KindRemoteMedia = "remote_media" const KindThumbnails = "thumbnails" +const KindArchives = "archives" var AllKinds = []string{KindLocalMedia, KindRemoteMedia, KindThumbnails} diff --git a/config.sample.yaml b/config.sample.yaml index 45a1d5c6949cec01c1189cd9609f99c52a10c86e..351da2efb61573ff86d6c95584cf73bb4b0efbd0 100644 --- a/config.sample.yaml +++ b/config.sample.yaml @@ -65,17 +65,22 @@ sharedSecretAuth: datastores: - type: file enabled: false # Enable this to set up data storage. - # Datastores can be split into three areas when handling uploads: thumbnails, remote_media, - # and local_media. Media is still de-duplicated across all datastores (local content which - # duplicates remote content will re-use the remote content's location). This option is useful - # if your datastore is becoming very large, or if you want faster storage for a particular - # kind of media. + # Datastores can be split into many areas when handling uploads. Media is still de-duplicated + # across all datastores (local content which duplicates remote content will re-use the remote + # content's location). This option is useful if your datastore is becoming very large, or if + # you want faster storage for a particular kind of media. + # + # The kinds available are: + # thumbnails - Used to store thumbnails of media (local and remote). + # remote_media - Original copies of remote media (servers not configured by this repo). + # local_media - Original uploads for local media. + # archives - Archives of content (GDPR and similar requests). forKinds: ["thumbnails"] opts: path: /var/matrix/media - type: s3 enabled: false # Enable this to set up s3 uploads - forKinds: ["thumbnails", "remote_media", "local_media"] + forKinds: ["thumbnails", "remote_media", "local_media", "archives"] opts: # The s3 uploader needs a temporary location to buffer files to reduce memory usage on # small file uploads. If the file size is unknown, the file is written to this location @@ -88,6 +93,21 @@ datastores: ssl: true bucketName: "your-media-bucket" +# Options for controlling archives. Archives are exports of a particular user's content for +# the purpose of GDPR or moving media to a different server. +archiving: + # Whether archiving is enabled or not. Default enabled. + enabled: true + # If true, users can request a copy of their own data. By default, only repository administrators + # can request a copy. + selfService: false + # The number of bytes to target per archive before breaking up the files. This is independent + # of any file upload limits and will require a similar amount of memory when performing an export. + # The file size is also a target, not a guarantee - it is possible to have files that are smaller + # or larger than the target. This is recommended to be approximately double the size of your + # file upload limit, provided there is enough memory available for the demand of exporting. 
+ targetBytesPerPart: 209715200 # 200mb default + # The file upload settings for the media repository uploads: maxBytes: 104857600 # 100MB default, 0 to disable diff --git a/controllers/data_controller/export_controller.go b/controllers/data_controller/export_controller.go new file mode 100644 index 0000000000000000000000000000000000000000..45fe10fc02b89387280ee3c470a900c52eb6f3b1 --- /dev/null +++ b/controllers/data_controller/export_controller.go @@ -0,0 +1,310 @@ +package data_controller + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "encoding/json" + "fmt" + "io" + "time" + + "github.com/dustin/go-humanize" + "github.com/sirupsen/logrus" + "github.com/turt2live/matrix-media-repo/common" + "github.com/turt2live/matrix-media-repo/common/config" + "github.com/turt2live/matrix-media-repo/storage" + "github.com/turt2live/matrix-media-repo/storage/datastore" + "github.com/turt2live/matrix-media-repo/storage/datastore/ds_s3" + "github.com/turt2live/matrix-media-repo/templating" + "github.com/turt2live/matrix-media-repo/types" + "github.com/turt2live/matrix-media-repo/util" +) + +type manifestRecord struct { + FileName string `json:"name"` + ArchivedName string `json:"file_name"` + SizeBytes int64 `json:"size_bytes"` + ContentType string `json:"content_type"` + S3Url string `json:"s3_url"` + Sha256 string `json:"sha256"` + Origin string `json:"origin"` + MediaId string `json:"media_id"` + CreatedTs int64 `json:"created_ts"` +} + +type manifest struct { + Version int `json:"version"` + UserId string `json:"user_id"` + CreatedTs int64 `json:"created_ts"` + Media map[string]*manifestRecord `json:"media"` +} + +func StartUserExport(userId string, s3urls bool, includeData bool, log *logrus.Entry) (*types.BackgroundTask, string, error) { + ctx := context.Background() + + exportId, err := util.GenerateRandomString(128) + if err != nil { + return nil, "", err + } + + db := storage.GetDatabase().GetMetadataStore(ctx, log) + task, err := db.CreateBackgroundTask("export_data", map[string]interface{}{ + "user_id": userId, + "include_s3_urls": s3urls, + "include_data": includeData, + "export_id": exportId, + }) + + if err != nil { + return nil, "", err + } + + go func() { + ds, err := datastore.PickDatastore(common.KindArchives, ctx, log) + if err != nil { + log.Error(err) + return + } + + mediaDb := storage.GetDatabase().GetMediaStore(ctx, log) + media, err := mediaDb.GetMediaByUser(userId) + if err != nil { + log.Error(err) + return + } + + exportDb := storage.GetDatabase().GetExportStore(ctx, log) + err = exportDb.InsertExport(exportId, userId) + if err != nil { + log.Error(err) + return + } + + var currentTar *tar.Writer + var currentTarBytes bytes.Buffer + part := 0 + parts := make([]*types.ObjectInfo, 0) + currentSize := int64(0) + + persistTar := func() error { + currentTar.Close() + + // compress + log.Info("Compressing tar file") + gzipBytes := bytes.Buffer{} + archiver := gzip.NewWriter(&gzipBytes) + archiver.Name = fmt.Sprintf("export-part-%d.tar", part) + _, err := io.Copy(archiver, util.BufferToStream(bytes.NewBuffer(currentTarBytes.Bytes()))) + if err != nil { + return err + } + archiver.Close() + + log.Info("Uploading compressed tar file") + buf := bytes.NewBuffer(gzipBytes.Bytes()) + size := int64(buf.Len()) + obj, err := ds.UploadFile(util.BufferToStream(buf), size, ctx, log) + if err != nil { + return err + } + parts = append(parts, obj) + + fname := fmt.Sprintf("export-part-%d.tgz", part) + err = exportDb.InsertExportPart(exportId, part, size, fname, 
ds.DatastoreId, obj.Location) + if err != nil { + return err + } + + return nil + } + + newTar := func() error { + if part > 0 { + log.Info("Persisting complete tar file") + err := persistTar() + if err != nil { + return err + } + } + + log.Info("Starting new tar file") + currentTarBytes = bytes.Buffer{} + currentTar = tar.NewWriter(¤tTarBytes) + part = part + 1 + currentSize = 0 + + return nil + } + + // Start the first tar file + log.Info("Creating first tar file") + err = newTar() + if err != nil { + log.Error(err) + return + } + + putFile := func(name string, size int64, creationTime time.Time, file io.Reader) error { + header := &tar.Header{ + Name: name, + Size: size, + Mode: int64(0644), + ModTime: creationTime, + } + err := currentTar.WriteHeader(header) + if err != nil { + log.Error("error writing header") + return err + } + + i, err := io.Copy(currentTar, file) + if err != nil { + log.Error("error writing file") + return err + } + + currentSize += i + + return nil + } + + archivedName := func(m *types.Media) string { + // TODO: Pick the right extension for the file type + return fmt.Sprintf("%s__%s.obj", m.Origin, m.MediaId) + } + + // Build a manifest first (JSON) + log.Info("Building manifest") + indexModel := &templating.ExportIndexModel{ + Entity: userId, + ExportID: exportId, + Media: make([]*templating.ExportIndexMediaModel, 0), + } + mediaManifest := make(map[string]*manifestRecord) + for _, m := range media { + s3url, err := ds_s3.GetS3URL(m.DatastoreId, m.Location) + if err != nil { + log.Warn(err) + } + mediaManifest[m.MxcUri()] = &manifestRecord{ + ArchivedName: archivedName(m), + FileName: m.UploadName, + SizeBytes: m.SizeBytes, + ContentType: m.ContentType, + S3Url: s3url, + Sha256: m.Sha256Hash, + Origin: m.Origin, + MediaId: m.MediaId, + CreatedTs: m.CreationTs, + } + indexModel.Media = append(indexModel.Media, &templating.ExportIndexMediaModel{ + ExportID: exportId, + ArchivedName: archivedName(m), + FileName: m.UploadName, + SizeBytes: m.SizeBytes, + SizeBytesHuman: humanize.Bytes(uint64(m.SizeBytes)), + Origin: m.Origin, + MediaID: m.MediaId, + Sha256Hash: m.Sha256Hash, + ContentType: m.ContentType, + UploadTs: m.CreationTs, + UploadDateHuman: util.FromMillis(m.CreationTs).Format(time.UnixDate), + }) + } + manifest := &manifest{ + Version: 1, + UserId: userId, + CreatedTs: util.NowMillis(), + Media: mediaManifest, + } + b, err := json.Marshal(manifest) + if err != nil { + log.Error(err) + return + } + + log.Info("Writing manifest") + err = putFile("manifest.json", int64(len(b)), time.Now(), util.BufferToStream(bytes.NewBuffer(b))) + if err != nil { + log.Error(err) + return + } + + if includeData { + log.Info("Building and writing index") + t, err := templating.GetTemplate("export_index") + if err != nil { + log.Error(err) + return + } + html := bytes.Buffer{} + err = t.Execute(&html, indexModel) + if err != nil { + log.Error(err) + return + } + err = putFile("index.html", int64(html.Len()), time.Now(), util.BufferToStream(bytes.NewBuffer(html.Bytes()))) + if err != nil { + log.Error(err) + return + } + + log.Info("Including data in the archive") + for _, m := range media { + log.Info("Downloading ", m.MxcUri()) + s, err := datastore.DownloadStream(ctx, log, m.DatastoreId, m.Location) + if err != nil { + log.Error(err) + continue + } + + log.Infof("Copying %s to memory", m.MxcUri()) + b := bytes.Buffer{} + _, err = io.Copy(&b, s) + if err != nil { + log.Error(err) + continue + } + s.Close() + s = util.BufferToStream(bytes.NewBuffer(b.Bytes())) + + 
log.Info("Archiving ", m.MxcUri()) + err = putFile(archivedName(m), m.SizeBytes, time.Unix(0, m.CreationTs*int64(time.Millisecond)), s) + if err != nil { + log.Error(err) + return + } + + if currentSize >= config.Get().Archiving.TargetBytesPerPart { + log.Info("Rotating tar") + err = newTar() + if err != nil { + log.Error(err) + return + } + } + } + } + + if currentSize > 0 { + log.Info("Persisting last tar") + err = persistTar() + if err != nil { + log.Error(err) + return + } + } + + log.Info("Finishing export task") + err = db.FinishedBackgroundTask(task.ID) + if err != nil { + log.Error(err) + log.Error("Failed to flag task as finished") + } + log.Info("Finished export") + }() + + return task, exportId, nil +} diff --git a/docs/admin.md b/docs/admin.md index 3da8650c8f28f2e00f862634f03e7a6c0166aaa0..5278ce1de3aa22ad0232dc553b3bb2b0d94a7033 100644 --- a/docs/admin.md +++ b/docs/admin.md @@ -310,3 +310,80 @@ The response is the status of the task: ``` **Note**: The `params` vary depending on the task. + +## Exporting/Importing data + +Exports (and therefore imports) are currently done on a per-user basis. This is primarily useful when moving users to new hosts or doing GDPR exports of user data. + +#### Exporting data for a user + +URL: `POST /_matrix/media/unstable/admin/user/<user ID>/export?include_data=true&s3_urls=true` + +Both query params are optional, and their default values are shown. If `include_data` is false, only metadata will be returned by the export. `s3_urls`, when true, includes the s3 URL to the media in the metadata if one is available. + +The response is a task ID and export ID to put into the 'view export' URL: + +```json +{ + "export_id": "abcdef", + "task_id": 12 +} +``` + +**Note**: the `export_id` will be included in the task's `params`. + +**Note**: the `export_id` should be treated as a secret/authentication token as it allows someone to download other people's data. + +#### Viewing an export + +After the task has been completed, the `export_id` can be used to download the content. + +URL: `GET /_matrix/media/unstable/admin/export/<export ID>/view` + +The response will be a webpage for the user to interact with. From this page, the user can say they've downloaded the export and delete it. + +#### Downloading an export (for scripts) + +Similar to viewing an export, an export may be downloaded to later be imported. + +Exports are split into several tar (gzipped) files and need to be downloaded individually. To get the list of files, call: + +`GET /_matrix/media/unstable/admin/export/<export ID>/metadata` + +which returns: + +```json +{ + "entity": "@travis:t2l.io", + "parts": [ + { + "index": 1, + "size": 1024000, + "name": "TravisR-part-1.tgz" + }, + { + "index": 2, + "size": 1024000, + "name": "TravisR-part-2.tgz" + } + ] +} +``` + +**Note**: the `name` demonstrated may be different and should not be parsed. The `size` is in bytes. + +Then one can call the following to download each part: + +`GET /_matrix/media/unstable/admin/export/<export ID>/part/<index>` + +#### Deleting an export + +After the export has been downloaded, it can be deleted. Note that this endpoint can be called by the user from the "view export" page. + +`DELETE /_matrix/media/unstable/admin/export/<export ID>` + +The response is an empty JSON object if successful. + +#### Importing a previous export + +Not yet implemented. 
diff --git a/go.mod b/go.mod index e9b59fcb18e9e853bd2b61be6fa5345eeaf32a06..db51ad7d1abd77acf022d98d58c1115d2c45144d 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/didip/tollbooth v4.0.0+incompatible github.com/disintegration/imaging v1.2.4 github.com/djherbis/stream v0.0.0-20180113022625-311bd3bbfe82 + github.com/dustin/go-humanize v1.0.0 github.com/dyatlov/go-opengraph v0.0.0-20160203134303-41a3523719df github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect github.com/fastly/go-utils v0.0.0-20180712184237-d95a45783239 // indirect diff --git a/go.sum b/go.sum index e3e07f7c4188455d93a921d4f658c89950c25509..5eeb97cf4ebbd210e033234cbbe6fc464c0a2960 100644 --- a/go.sum +++ b/go.sum @@ -22,6 +22,8 @@ github.com/disintegration/imaging v1.2.4 h1:eJRPGef+mQ4WZ8cED/pqElxW4+79zBjJYTjY github.com/disintegration/imaging v1.2.4/go.mod h1:9B/deIUIrliYkyMTuXJd6OUFLcrZ2tf+3Qlwnaf/CjU= github.com/djherbis/stream v0.0.0-20180113022625-311bd3bbfe82 h1:RIpwzqf44uzYuaK6yfIp9m985kwVR82yvV77T7Cvkjk= github.com/djherbis/stream v0.0.0-20180113022625-311bd3bbfe82/go.mod h1:ZNVKPVRCmrwhCwQHZUpVHHrq2rtGLrG1t3T/TThYLP8= +github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dyatlov/go-opengraph v0.0.0-20160203134303-41a3523719df h1:/XZvBjuoyXsk7DRR9vI8JSRTqRS8f163L9bRdY5Sfw0= github.com/dyatlov/go-opengraph v0.0.0-20160203134303-41a3523719df/go.mod h1:nYia/MIs9OyvXXYboPmNOj0gVWo97Wx0sde+ZuKkoM4= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= diff --git a/migrations/13_add_export_tables_down.sql b/migrations/13_add_export_tables_down.sql new file mode 100644 index 0000000000000000000000000000000000000000..52260564f15e87e80a93066572afbfacaac517e2 --- /dev/null +++ b/migrations/13_add_export_tables_down.sql @@ -0,0 +1,3 @@ +DROP INDEX export_parts_index; +DROP TABLE export_parts; +DROP TABLE exports; diff --git a/migrations/13_add_export_tables_up.sql b/migrations/13_add_export_tables_up.sql new file mode 100644 index 0000000000000000000000000000000000000000..a4c02105172e2d8c29ff48671e262831c87dd3ba --- /dev/null +++ b/migrations/13_add_export_tables_up.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS exports ( + export_id TEXT PRIMARY KEY NOT NULL, + entity TEXT NOT NULL +); +CREATE TABLE IF NOT EXISTS export_parts ( + export_id TEXT NOT NULL, + index INT NOT NULL, + size_bytes BIGINT NOT NULL, + file_name TEXT NOT NULL, + datastore_id TEXT NOT NULL, + location TEXT NOT NULL +); +CREATE UNIQUE INDEX IF NOT EXISTS export_parts_index ON export_parts (export_id, index); diff --git a/storage/datastore/ds_s3/s3_store.go b/storage/datastore/ds_s3/s3_store.go index 7c53ba1868df07e8fc136b40a1c5b5e4f88d1e81..64c7ba3f87f69f39d717374c3b21ab28c8dc1074 100644 --- a/storage/datastore/ds_s3/s3_store.go +++ b/storage/datastore/ds_s3/s3_store.go @@ -2,6 +2,7 @@ package ds_s3 import ( "context" + "fmt" "io" "io/ioutil" "os" @@ -64,6 +65,17 @@ func GetOrCreateS3Datastore(dsId string, conf config.DatastoreConfig) (*s3Datast return s3ds, nil } +func GetS3URL(datastoreId string, location string) (string, error) { + var store *s3Datastore + var ok bool + if store, ok = stores[datastoreId]; !ok { + return "", errors.New("s3 datastore not found") + } + + // HACK: Surely there's a better way... 
+ return fmt.Sprintf("https://%s/%s/%s", store.conf.Options["endpoint"], store.bucket, location), nil +} + func (s *s3Datastore) EnsureBucketExists() error { found, err := s.client.BucketExists(s.bucket) if err != nil { diff --git a/storage/storage.go b/storage/storage.go index 3cfd462550f8760c7a502ba963dcd39737e2a930..b74908e0c3c61888e661af8defd5e3c91f7b5833 100644 --- a/storage/storage.go +++ b/storage/storage.go @@ -22,6 +22,7 @@ type repos struct { thumbnailStore *stores.ThumbnailStoreFactory urlStore *stores.UrlStoreFactory metadataStore *stores.MetadataStoreFactory + exportStore *stores.ExportStoreFactory } var dbInstance *Database @@ -79,6 +80,10 @@ func OpenDatabase(connectionString string, maxConns int, maxIdleConns int) error if d.repos.metadataStore, err = stores.InitMetadataStore(d.db); err != nil { return err } + logrus.Info("Setting up export DB store...") + if d.repos.exportStore, err = stores.InitExportStore(d.db); err != nil { + return err + } // Run some tasks that should always be done on startup if err = populateDatastores(d); err != nil { @@ -107,3 +112,7 @@ func (d *Database) GetUrlStore(ctx context.Context, log *logrus.Entry) *stores.U func (d *Database) GetMetadataStore(ctx context.Context, log *logrus.Entry) *stores.MetadataStore { return d.repos.metadataStore.Create(ctx, log) } + +func (d *Database) GetExportStore(ctx context.Context, log *logrus.Entry) *stores.ExportStore { + return d.repos.exportStore.Create(ctx, log) +} diff --git a/storage/stores/export_store.go b/storage/stores/export_store.go new file mode 100644 index 0000000000000000000000000000000000000000..87d094d6b9592190f03babf1ab508b9f2f705449 --- /dev/null +++ b/storage/stores/export_store.go @@ -0,0 +1,151 @@ +package stores + +import ( + "context" + "database/sql" + + "github.com/sirupsen/logrus" + "github.com/turt2live/matrix-media-repo/types" +) + +const insertExportMetadata = "INSERT INTO exports (export_id, entity) VALUES ($1, $2);" +const insertExportPart = "INSERT INTO export_parts (export_id, index, size_bytes, file_name, datastore_id, location) VALUES ($1, $2, $3, $4, $5, $6);" +const selectExportMetadata = "SELECT export_id, entity FROM exports WHERE export_id = $1;" +const selectExportParts = "SELECT export_id, index, size_bytes, file_name, datastore_id, location FROM export_parts WHERE export_id = $1;" +const selectExportPart = "SELECT export_id, index, size_bytes, file_name, datastore_id, location FROM export_parts WHERE export_id = $1 AND index = $2;" +const deleteExportParts = "DELETE FROM export_parts WHERE export_id = $1;" +const deleteExport = "DELETE FROM exports WHERE export_id = $1;" + +type exportStoreStatements struct { + insertExportMetadata *sql.Stmt + insertExportPart *sql.Stmt + selectExportMetadata *sql.Stmt + selectExportParts *sql.Stmt + selectExportPart *sql.Stmt + deleteExportParts *sql.Stmt + deleteExport *sql.Stmt +} + +type ExportStoreFactory struct { + sqlDb *sql.DB + stmts *exportStoreStatements +} + +type ExportStore struct { + factory *ExportStoreFactory // just for reference + ctx context.Context + log *logrus.Entry + statements *exportStoreStatements // copied from factory +} + +func InitExportStore(sqlDb *sql.DB) (*ExportStoreFactory, error) { + store := ExportStoreFactory{stmts: &exportStoreStatements{}} + var err error + + store.sqlDb = sqlDb + + if store.stmts.insertExportMetadata, err = store.sqlDb.Prepare(insertExportMetadata); err != nil { + return nil, err + } + if store.stmts.insertExportPart, err = store.sqlDb.Prepare(insertExportPart); err != nil { 
+ return nil, err + } + if store.stmts.selectExportMetadata, err = store.sqlDb.Prepare(selectExportMetadata); err != nil { + return nil, err + } + if store.stmts.selectExportParts, err = store.sqlDb.Prepare(selectExportParts); err != nil { + return nil, err + } + if store.stmts.selectExportPart, err = store.sqlDb.Prepare(selectExportPart); err != nil { + return nil, err + } + if store.stmts.deleteExportParts, err = store.sqlDb.Prepare(deleteExportParts); err != nil { + return nil, err + } + if store.stmts.deleteExport, err = store.sqlDb.Prepare(deleteExport); err != nil { + return nil, err + } + + return &store, nil +} + +func (f *ExportStoreFactory) Create(ctx context.Context, entry *logrus.Entry) *ExportStore { + return &ExportStore{ + factory: f, + ctx: ctx, + log: entry, + statements: f.stmts, // we copy this intentionally + } +} + +func (s *ExportStore) InsertExport(exportId string, entity string) error { + _, err := s.statements.insertExportMetadata.ExecContext(s.ctx, exportId, entity) + return err +} + +func (s *ExportStore) InsertExportPart(exportId string, index int, size int64, name string, datastoreId string, location string) error { + _, err := s.statements.insertExportPart.ExecContext(s.ctx, exportId, index, size, name, datastoreId, location) + return err +} + +func (s *ExportStore) GetExportMetadata(exportId string) (*types.ExportMetadata, error) { + m := &types.ExportMetadata{} + err := s.statements.selectExportMetadata.QueryRowContext(s.ctx, exportId).Scan( + &m.ExportID, + &m.Entity, + ) + return m, err +} + +func (s *ExportStore) GetExportParts(exportId string) ([]*types.ExportPart, error) { + rows, err := s.statements.selectExportParts.QueryContext(s.ctx, exportId) + if err != nil { + return nil, err + } + + var results []*types.ExportPart + for rows.Next() { + obj := &types.ExportPart{} + err = rows.Scan( + &obj.ExportID, + &obj.Index, + &obj.SizeBytes, + &obj.FileName, + &obj.DatastoreID, + &obj.Location, + ) + if err != nil { + return nil, err + } + results = append(results, obj) + } + + return results, nil +} + +func (s *ExportStore) GetExportPart(exportId string, index int) (*types.ExportPart, error) { + m := &types.ExportPart{} + err := s.statements.selectExportPart.QueryRowContext(s.ctx, exportId, index).Scan( + &m.ExportID, + &m.Index, + &m.SizeBytes, + &m.FileName, + &m.DatastoreID, + &m.Location, + ) + return m, err +} + +func (s *ExportStore) DeleteExportAndParts(exportId string) error { + _, err := s.statements.deleteExportParts.ExecContext(s.ctx, exportId) + if err != nil { + return err + } + + _, err = s.statements.deleteExport.ExecContext(s.ctx, exportId) + if err != nil { + return err + } + + return nil +} diff --git a/templates/export_index.html b/templates/export_index.html new file mode 100644 index 0000000000000000000000000000000000000000..98fe9ebcc453f3d63a4dacb2e8474b592adb6093 --- /dev/null +++ b/templates/export_index.html @@ -0,0 +1,74 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <title>{{.Entity}} media export</title> + <style type="text/css"> + body, html { + margin: 0; + padding: 0; + background-color: #eaeaea; + color: #111; + font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; + } + .container { + width: 750px; + margin: 50px auto auto; + background-color: #fff; + border-radius: 5px; + padding: 20px; + border: 1px solid rgba(143,45,86,.2); + box-shadow: 0 20px 40px 20px rgba(206,222,235,.34); + } + table { + word-break: break-word; + border: solid 1px rgba(143,45,86,.4); + border-collapse: collapse; + border-spacing: 0; + } + 
table th { + background-color: rgba(143,45,86,.2); + border: solid 1px rgba(143,45,86,.4); + color: #6b3d53; + padding: 10px; + text-align: left; + } + table td { + border: solid 1px rgba(143,45,86,.4); + padding: 10px; + } + .muted { + display: block; + color: #999; + } + </style> +</head> +<body> +<div class="container"> + <h1>{{.Entity}} media export</h1> + <p> + Assuming you've extracted each and every part of your data, your files can be accessed here. + </p> + <table> + <thead> + <tr> + <th>File name</th> + <th>Size</th> + <th>Upload date</th> + </tr> + </thead> + <tbody> + {{range .Media}} + <tr> + <td> + <a href="{{.ArchivedName}}"><code>{{if eq .FileName ""}}<no name>{{else}}{{.FileName}}{{end}}</code></a> + <span class="muted"><code>mxc://{{.Origin}}/{{.MediaID}}</code></span> + </td> + <td>{{.SizeBytesHuman}}</td> + <td>{{.UploadDateHuman}}</td> + </tr> + {{end}} + </tbody> + </table> +</div> +</body> +</html> \ No newline at end of file diff --git a/templates/view_export.html b/templates/view_export.html new file mode 100644 index 0000000000000000000000000000000000000000..6c8ae79c61186cd3eb22b5cf6a0f40d10fda59fe --- /dev/null +++ b/templates/view_export.html @@ -0,0 +1,71 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <title>{{.Entity}} media export</title> + <style type="text/css"> + body, html { + margin: 0; + padding: 0; + background-color: #eaeaea; + color: #111; + font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; + } + .container { + width: 500px; + margin: 50px auto auto; + background-color: #fff; + border-radius: 5px; + padding: 20px; + border: 1px solid rgba(143,45,86,.2); + box-shadow: 0 20px 40px 20px rgba(206,222,235,.34); + } + </style> + <noscript> + <style type="text/css"> + #delete-option { + display: none; + } + </style> + </noscript> + <script type="text/javascript"> + <!-- + function deleteExport() { + var url = "/_matrix/media/unstable/admin/export/{{.ExportID}}/delete"; + var xhr = new XMLHttpRequest(); + xhr.open("DELETE", url, true); + xhr.onreadystatechange = function() { + if (this.readyState !== 4) return; + var element = document.getElementById("main"); + if (!element) return; + + if (this.status === 200) { + element.innerHTML = "<h1>{{.Entity}} media export</h1><p>Your export has been deleted.</p>"; + } else { + element.innerHTML = "<h1>{{.Entity}} media export</h1><p>There was a problem deleting your export. Please reload the page and try again.</p>"; + } + }; + xhr.send(); + } + //--> + </script> +</head> +<body> + <div class="container" id="main"> + <h1>{{.Entity}} media export</h1> + <p> + Your media can be downloaded in parts below. Each part is a portion of your data and can be extracted using + free software like <a href="https://www.7-zip.org/" target="_blank">7-Zip</a>. To see all of your data, download + and extract each part then open the <code>index.html</code> file. + </p> + <ul> + {{range .ExportParts}} + <li><a href="/_matrix/media/unstable/admin/export/{{.ExportID}}/part/{{.Index}}" download>{{.FileName}}</a> ({{.SizeBytesHuman}})</li> + {{end}} + </ul> + <p id="delete-option">Downloaded all your data? 
<a href="javascript:deleteExport()">Delete your export</a></p> + <noscript> + <p>To delete your export, please enable JavaScript</p> + </noscript> + </div> +</body> +</html> \ No newline at end of file diff --git a/templating/models.go b/templating/models.go new file mode 100644 index 0000000000000000000000000000000000000000..d047e0da869284be4dc669ebad5dc6fecb8aea39 --- /dev/null +++ b/templating/models.go @@ -0,0 +1,35 @@ +package templating + +type ViewExportPartModel struct { + ExportID string + Index int + SizeBytes int64 + SizeBytesHuman string + FileName string +} + +type ViewExportModel struct { + ExportID string + Entity string + ExportParts []*ViewExportPartModel +} + +type ExportIndexMediaModel struct { + ExportID string + ArchivedName string + FileName string + Origin string + MediaID string + SizeBytes int64 + SizeBytesHuman string + UploadTs int64 + UploadDateHuman string + Sha256Hash string + ContentType string +} + +type ExportIndexModel struct { + ExportID string + Entity string + Media []*ExportIndexMediaModel +} diff --git a/templating/templates.go b/templating/templates.go new file mode 100644 index 0000000000000000000000000000000000000000..508f80967501ff780b1954f78d297dbbe7a1cd10 --- /dev/null +++ b/templating/templates.go @@ -0,0 +1,45 @@ +package templating + +import ( + "fmt" + "html/template" + "path" + "sync" + + "github.com/turt2live/matrix-media-repo/common/config" +) + +type templates struct { + cached map[string]*template.Template +} + +var instance *templates +var singletonLock = &sync.Once{} + +func GetInstance() *templates { + if instance == nil { + singletonLock.Do(func() { + instance = &templates{ + cached: make(map[string]*template.Template), + } + }) + } + return instance +} + +func GetTemplate(name string) (*template.Template, error) { + i := GetInstance() + //if v, ok := i.cached[name]; ok { + // return v, nil + //} + + fname := fmt.Sprintf("%s.html", name) + tmplPath := path.Join(config.Runtime.TemplatesPath, fname) + t, err := template.New(fname).ParseFiles(tmplPath) + if err != nil { + return nil, err + } + + i.cached[name] = t + return t, nil +} diff --git a/types/exports.go b/types/exports.go new file mode 100644 index 0000000000000000000000000000000000000000..17006fb79b59de5dd7f6fcd64f79b2709b3296d4 --- /dev/null +++ b/types/exports.go @@ -0,0 +1,15 @@ +package types + +type ExportMetadata struct { + ExportID string + Entity string +} + +type ExportPart struct { + ExportID string + Index int + FileName string + SizeBytes int64 + DatastoreID string + Location string +} diff --git a/util/time.go b/util/time.go index e6c2d82f9620aa9a96a33aecc02cdceadfe8bfc5..f9b496da2e221ee87e8d3578540b3b0e1f2cbfe1 100644 --- a/util/time.go +++ b/util/time.go @@ -5,3 +5,7 @@ import "time" func NowMillis() int64 { return time.Now().UnixNano() / 1000000 } + +func FromMillis(m int64) time.Time { + return time.Unix(0, m*int64(time.Millisecond)) +}