From 213675f5dabca3632456b64449d314765364c26b Mon Sep 17 00:00:00 2001
From: Travis Ralston <travpc@gmail.com>
Date: Sat, 4 Jan 2020 17:46:39 -0700
Subject: [PATCH] Add binaries for easier exports/imports

Fixes https://github.com/turt2live/matrix-media-repo/issues/215
---
 .circleci/config.yml                          |   6 +
 .gitignore                                    |   1 +
 CHANGELOG.md                                  |   1 +
 README.md                                     |  40 +++++
 appveyor.yml                                  |   4 +
 cmd/gdpr_export/main.go                       | 138 ++++++++++++++++++
 cmd/gdpr_import/main.go                       | 110 ++++++++++++++
 .../data_controller/import_controller.go      |   6 +-
 docs/admin.md                                 |   4 +
 9 files changed, 306 insertions(+), 4 deletions(-)
 create mode 100644 cmd/gdpr_export/main.go
 create mode 100644 cmd/gdpr_import/main.go

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a0edf3f5..1e5e24fc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -17,6 +17,12 @@ jobs:
       - store_artifacts:
           path: bin/import_synapse
           destination: import_synapse
+      - store_artifacts:
+          path: bin/gdpr_export
+          destination: gdpr_export
+      - store_artifacts:
+          path: bin/gdpr_import
+          destination: gdpr_import
 workflows:
   version: 2
   build_and_test:
diff --git a/.gitignore b/.gitignore
index 17100f70..c0eb9bc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 /logs
 /vendor
 /config
+/gdpr-data
 
 # Generated files
 assets.bin.go
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bc2c80af..3feb999a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Added
 
 * Compile assets (templates and migrations) into the binary for ease of deployment.
+* Added binaries to make exports and imports easier.
 
 ### Fixed
 
diff --git a/README.md b/README.md
index c7d209dc..888a8cd6 100644
--- a/README.md
+++ b/README.md
@@ -140,3 +140,43 @@ release though if you want to avoid building it yourself.
     Assuming the media repository, postgres database, and synapse are all on the same host, the command to run would look something like: `bin/import_synapse -serverName myserver.com -dbUsername my_database_user -dbName synapse`
 4. Wait for the import to complete. The script will automatically deduplicate media.
 5. Point traffic to the media repository.
+
+## Export and import user data
+
+The admin APIs for this are specified in [docs/admin.md](./docs/admin.md), though they can be difficult to use for scripts.
+The `bin/gdpr_export` and `bin/gdpr_import` binaries do the process for you, and do so in memory but against the real
+media repo database and datastores - this moves the resource intensiveness to the binary you're running instead of the
+media repo instance, but still makes reads and writes to your database and datastores. For example, when exporting a 
+user's data the binary will pull all the data locally and write it to disk for you, but during that process the user's
+export is accessible via the main media repo too. The export is deleted if the binary is successful at exporting the 
+data.
+
+**Note**: Imports done through this method can affect other homeservers! For example, a user's data export could contain
+an entry for a homeserver other than their own, which the media repo will happily import. Always validate the manifest
+of an import before running it!
+
+With your media repo config available, here's the help for each binary:
+
+```
+Usage of gdpr_export:
+  -config string
+        The path to the configuration (default "media-repo.yaml")
+  -destination string
+        The directory for where export files should be placed (default "./gdpr-data")
+  -entity string
+        The user ID or server name to export
+  -migrations string
+        The absolute path for the migrations folder (default "./migrations")
+  -templates string
+        The absolute path for the templates folder (default "./templates")
+```
+
+```
+Usage of gdpr_import:
+  -config string
+        The path to the configuration (default "media-repo.yaml")
+  -directory string
+        The directory for where the entity's exported files are (default "./gdpr-data")
+  -migrations string
+        The absolute path for the migrations folder (default "./migrations")
+```
diff --git a/appveyor.yml b/appveyor.yml
index e25502bd..4d169448 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -28,5 +28,9 @@ artifacts:
     name: media_repo.exe
   - path: bin/import_synapse.exe
     name: import_synapse.exe
+  - path: bin/gdpr_export.exe
+    name: gdpr_export.exe
+  - path: bin/gdpr_import.exe
+    name: gdpr_import.exe
 
 test_script: [] # https://github.com/turt2live/matrix-media-repo/issues/40
diff --git a/cmd/gdpr_export/main.go b/cmd/gdpr_export/main.go
new file mode 100644
index 00000000..edfaeabf
--- /dev/null
+++ b/cmd/gdpr_export/main.go
@@ -0,0 +1,138 @@
+package main
+
+import (
+	"flag"
+	"io"
+	"os"
+	"path"
+	"time"
+
+	"github.com/sirupsen/logrus"
+	"github.com/turt2live/matrix-media-repo/common/assets"
+	"github.com/turt2live/matrix-media-repo/common/config"
+	"github.com/turt2live/matrix-media-repo/common/logging"
+	"github.com/turt2live/matrix-media-repo/common/rcontext"
+	"github.com/turt2live/matrix-media-repo/common/runtime"
+	"github.com/turt2live/matrix-media-repo/controllers/data_controller"
+	"github.com/turt2live/matrix-media-repo/storage"
+	"github.com/turt2live/matrix-media-repo/storage/datastore"
+	"github.com/turt2live/matrix-media-repo/types"
+)
+
+func main() {
+	configPath := flag.String("config", "media-repo.yaml", "The path to the configuration")
+	migrationsPath := flag.String("migrations", config.DefaultMigrationsPath, "The absolute path for the migrations folder")
+	templatesPath := flag.String("templates", config.DefaultTemplatesPath, "The absolute path for the templates folder")
+	entity := flag.String("entity", "", "The user ID or server name to export")
+	destination := flag.String("destination", "./gdpr-data", "The directory for where export files should be placed")
+	flag.Parse()
+
+	if *entity == "" {
+		flag.Usage()
+		os.Exit(1)
+		return
+	}
+
+	config.Path = *configPath
+	assets.SetupTemplatesAndMigrations(*migrationsPath, *templatesPath)
+
+	var err error
+	err = logging.Setup(config.Get().General.LogDirectory)
+	if err != nil {
+		panic(err)
+	}
+
+	logrus.Info("Starting up...")
+	runtime.RunStartupSequence()
+
+	logrus.Info("Starting export...")
+	ctx := rcontext.Initial().LogWithFields(logrus.Fields{"flagEntity": *entity})
+	var task *types.BackgroundTask
+	var exportId string
+	if (*entity)[0] == '@' {
+		task, exportId, err = data_controller.StartUserExport(*entity, true, true, ctx)
+	} else {
+		task, exportId, err = data_controller.StartServerExport(*entity, true, true, ctx)
+	}
+
+	if err != nil {
+		panic(err)
+	}
+
+	logrus.Info("Waiting for export to complete")
+	waitChan := make(chan bool)
+	go func() {
+		// Initial sleep to let the caches fill
+		time.Sleep(1 * time.Second)
+
+		ctx := rcontext.Initial().LogWithFields(logrus.Fields{"flagEntity": *entity, "async": true})
+		db := storage.GetDatabase().GetMetadataStore(ctx)
+		for true {
+			ctx.Log.Info("Checking if task is complete")
+
+			task, err := db.GetBackgroundTask(task.ID)
+			if err != nil {
+				logrus.Error(err)
+			} else if task.EndTs > 0 {
+				waitChan<-true
+				return
+			}
+
+			time.Sleep(1 * time.Second)
+		}
+	}()
+	<-waitChan
+
+	logrus.Info("Export finished, dumping files")
+	exportDb := storage.GetDatabase().GetExportStore(ctx)
+	parts, err := exportDb.GetExportParts(exportId)
+	if err != nil {
+		panic(err)
+	}
+
+	// Create directory if not exists
+	_ = os.MkdirAll(*destination, os.ModePerm)
+
+	for _, p := range parts {
+		s, err := datastore.DownloadStream(ctx, p.DatastoreID, p.Location)
+		if err != nil {
+			panic(err)
+		}
+
+		fname := path.Join(*destination, p.FileName)
+		logrus.Info("Writing ", fname)
+		f, err := os.Create(fname)
+		if err != nil {
+			panic(err)
+		}
+		_, err = io.Copy(f, s)
+		if err != nil {
+			panic(err)
+		}
+		_ = f.Close()
+		_ = s.Close()
+	}
+
+	logrus.Info("Deleting export now that it has been dumped")
+	for _, p := range parts {
+		logrus.Info("Finding datastore for ", p.FileName, " / ", p.DatastoreID)
+		ds, err := datastore.LocateDatastore(ctx, p.DatastoreID)
+		if err != nil {
+			panic(err)
+		}
+
+		logrus.Info("Deleting object ", p.Location)
+		err = ds.DeleteObject(p.Location)
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	logrus.Info("Purging export from database")
+	err = exportDb.DeleteExportAndParts(exportId)
+	if err != nil {
+		panic(err)
+	}
+
+	logrus.Infof("Export complete! Files for %s should be in %s", *entity, *destination)
+}
diff --git a/cmd/gdpr_import/main.go b/cmd/gdpr_import/main.go
new file mode 100644
index 00000000..e46f0a14
--- /dev/null
+++ b/cmd/gdpr_import/main.go
@@ -0,0 +1,110 @@
+package main
+
+import (
+	"flag"
+	"io/ioutil"
+	"os"
+	"path"
+	"time"
+
+	"github.com/sirupsen/logrus"
+	"github.com/turt2live/matrix-media-repo/common/assets"
+	"github.com/turt2live/matrix-media-repo/common/config"
+	"github.com/turt2live/matrix-media-repo/common/logging"
+	"github.com/turt2live/matrix-media-repo/common/rcontext"
+	"github.com/turt2live/matrix-media-repo/common/runtime"
+	"github.com/turt2live/matrix-media-repo/controllers/data_controller"
+	"github.com/turt2live/matrix-media-repo/storage"
+)
+
+func main() {
+	configPath := flag.String("config", "media-repo.yaml", "The path to the configuration")
+	migrationsPath := flag.String("migrations", config.DefaultMigrationsPath, "The absolute path for the migrations folder")
+	filesDir := flag.String("directory", "./gdpr-data", "The directory for where the entity's exported files are")
+	flag.Parse()
+
+	config.Path = *configPath
+	assets.SetupTemplatesAndMigrations(*migrationsPath, "")
+
+	var err error
+	err = logging.Setup(config.Get().General.LogDirectory)
+	if err != nil {
+		panic(err)
+	}
+
+	logrus.Info("Starting up...")
+	runtime.RunStartupSequence()
+
+	logrus.Info("Discovering files...")
+	fileInfos, err := ioutil.ReadDir(*filesDir)
+	if err != nil {
+		panic(err)
+	}
+	files := make([]string, 0)
+	for _, f := range fileInfos {
+		files = append(files, path.Join(*filesDir, f.Name()))
+	}
+
+	logrus.Info("Starting import...")
+	ctx := rcontext.Initial().LogWithFields(logrus.Fields{"flagDir": *filesDir})
+
+	f, err := os.Open(files[0])
+	if err != nil {
+		panic(err)
+	}
+	handles := make([]*os.File, 0)
+	handles = append(handles, f)
+	task, importId, err := data_controller.StartImport(f, ctx)
+	if err != nil {
+		panic(err)
+	}
+
+	logrus.Info("Appending all other files to import")
+	for i, fname := range files {
+		if i == 0 {
+			continue // already imported
+		}
+
+		logrus.Info("Appending ", fname)
+		f, err := os.Open(fname)
+		if err != nil {
+			panic(err)
+		}
+		handles = append(handles, f)
+		err = data_controller.AppendToImport(importId, f)
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	logrus.Info("Waiting for import to complete")
+	waitChan := make(chan bool)
+	go func() {
+		// Initial sleep to let the caches fill
+		time.Sleep(1 * time.Second)
+
+		ctx := rcontext.Initial().LogWithFields(logrus.Fields{"async": true})
+		db := storage.GetDatabase().GetMetadataStore(ctx)
+		for true {
+			ctx.Log.Info("Checking if task is complete")
+
+			task, err := db.GetBackgroundTask(task.ID)
+			if err != nil {
+				logrus.Error(err)
+			} else if task.EndTs > 0 {
+				waitChan<-true
+				return
+			}
+
+			time.Sleep(1 * time.Second)
+		}
+	}()
+	<-waitChan
+
+	logrus.Info("Import finished, cleaning up")
+	for _, h := range handles {
+		h.Close()
+	}
+
+	logrus.Infof("Import complete!")
+}
diff --git a/controllers/data_controller/import_controller.go b/controllers/data_controller/import_controller.go
index dd7d302d..5dc24925 100644
--- a/controllers/data_controller/import_controller.go
+++ b/controllers/data_controller/import_controller.go
@@ -12,7 +12,6 @@ import (
 	"net/http"
 	"sync"
 
-	"github.com/prometheus/common/log"
 	"github.com/turt2live/matrix-media-repo/common"
 	"github.com/turt2live/matrix-media-repo/common/rcontext"
 	"github.com/turt2live/matrix-media-repo/controllers/upload_controller"
@@ -159,7 +158,6 @@ func doImport(updateChannel chan *importUpdate, taskId int, importId string, ctx
 			fileMap[name] = fileBytes
 		}
 
-		// TODO: Search for a manifest and import a bunch of files
 		var manifestBuf *bytes.Buffer
 		var ok bool
 		if manifestBuf, ok = fileMap["manifest.json"]; !ok {
@@ -245,7 +243,7 @@ func doImport(updateChannel chan *importUpdate, taskId int, importId string, ctx
 				ctx.Log.Infof("Seeing if a datastore for %s/%s exists", endpoint, bucket)
 				datastores, err := datastore.GetAvailableDatastores(ctx)
 				if err != nil {
-					log.Errorf("Error locating datastore: %s", err.Error())
+					ctx.Log.Errorf("Error locating datastore: %s", err.Error())
 					continue
 				}
 				imported := false
@@ -323,7 +321,7 @@ func doImport(updateChannel chan *importUpdate, taskId int, importId string, ctx
 				continue
 			}
 
-			log.Info("Counting file as imported")
+			ctx.Log.Info("Counting file as imported")
 			imported[mxc] = true
 		}
 
diff --git a/docs/admin.md b/docs/admin.md
index a00bf477..f43a4d40 100644
--- a/docs/admin.md
+++ b/docs/admin.md
@@ -410,6 +410,10 @@ Once an export has been completed it can be imported back into the media repo. F
 
 **Note**: Only repository administrators can perform imports, regardless of who they are for.
 
+**Note**: Imports done through this method can affect other homeservers! For example, a user's data export could contain
+an entry for a homeserver other than their own, which the media repo will happily import. Always validate the manifest
+of an import before running it!
+
 URL: `POST /_matrix/media/unstable/admin/import`
 
 The request body is the bytes of the first archive (eg: `TravisR-part-1.tgz` in the above examples).
-- 
GitLab