From 1fd8e694c26a05069da7c660f1c4b95395bfea59 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 18 May 2018 23:06:08 -0400 Subject: use a go workspace --- Makefile | 7 ++ dedupe.go | 245 -------------------------------------------- go/.gitignore | 2 + go/src/cow-dedupe/dedupe.go | 245 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 254 insertions(+), 245 deletions(-) delete mode 100644 dedupe.go create mode 100644 go/.gitignore create mode 100644 go/src/cow-dedupe/dedupe.go diff --git a/Makefile b/Makefile index 9693ea4..fe3d2ca 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ CPPFLAGS += -O2 -D_FORTIFY_SOURCE=2 all: cow-dedupe-range all: cow-extent-map +all: cow-dedupe .PHONY: all %: src/%.o @@ -18,5 +19,11 @@ all: cow-extent-map cow-dedupe-range: lib/dedupe-range.o cow-extent-map: lib/extent-map.o lib/strextentflags.o +cow-dedupe: go/bin/cow-dedupe + cp $< $@ +go/bin/cow-dedupe: FORCE + GOPATH=$(CURDIR)/go go install $(@F) + .SECONDARY: .DELETE_ON_ERROR: +.PHONY: FORCE diff --git a/dedupe.go b/dedupe.go deleted file mode 100644 index f28d2f4..0000000 --- a/dedupe.go +++ /dev/null @@ -1,245 +0,0 @@ -package main - -import ( - "bufio" - "fmt" - "io" - "os" - "os/exec" - "path/filepath" - "runtime" - "strconv" - "strings" -) - -//#include <unistd.h> -import "C" - -var arg_max = int(C.sysconf(C._SC_ARG_MAX)) - -func errhandle(err error) { - if err != nil { - panic(err) - } -} - -func findLikelyDups(paths []string) map[string][]string { - ret := map[string][]string{} - var err error - for i := range paths { - paths[i], err = filepath.Abs(paths[i]) - errhandle(err) - } - cmd := exec.Command("find", append(paths, "-type", "f", "-printf", "%s %p\\0")...) 
- stdout, err := cmd.StdoutPipe() - errhandle(err) - cmd.Stderr = os.Stderr - errhandle(cmd.Start()) - rd := bufio.NewReader(stdout) - for { - line, err := rd.ReadString('\x00') - if line == "" && err == io.EOF { - break - } - errhandle(err) - parts := strings.SplitN(strings.TrimSuffix(line, "\x00"), " ", 2) - if len(parts) != 2 { - panic("wut") - } - size := parts[0] - filename := parts[1] - basename := filepath.Base(filename) - key := size + " " + basename - ret[key] = append(ret[key], filename) - } - errhandle(cmd.Wait()) - for key := range ret { - if len(ret[key]) < 2 { - delete(ret, key) - } - } - return ret -} - -func getFiemaps(paths []string) map[string][]string { - ret := map[string][]string{} - fmt.Fprintf(os.Stderr, "Getting fiemaps for %d files...\n", len(paths)) - - cnt := 0 - for len(paths) > 0 { - _paths := paths - arg_len := 0 - for i := range _paths { - arg_len += len(_paths[i]) + 1 - if arg_len > arg_max/2 { - _paths = _paths[:i-1] - break - } - } - paths = paths[len(_paths):] - - cmd := exec.Command("./cow-extent-map", append([]string{"-m", "--"}, _paths...)...) 
- stdout, err := cmd.StdoutPipe() - errhandle(err) - cmd.Stderr = os.Stderr - errhandle(cmd.Start()) - rd := bufio.NewReader(stdout) - for { - filename, err := rd.ReadString('\x00') - if filename == "" && err == io.EOF { - break - } - filename = strings.TrimSuffix(filename, "\x00") - if !strings.HasPrefix(filename, "/") { - panic("ugly filename") - } - errhandle(err) - fiemap, err := rd.ReadString('\x00') - fiemap = strings.TrimSuffix(fiemap, "\x00") - if !(strings.HasPrefix(fiemap, "logical=") || fiemap == "") { - panic("ugly fiemap") - } - errhandle(err) - - ret[fiemap] = append(ret[fiemap], filename) - cnt++ - fmt.Fprintf(os.Stderr, "\r%d ", cnt) - } - errhandle(cmd.Wait()) - } - - fmt.Fprintf(os.Stderr, "\r...done \n") - return ret -} - -func getChecksums(paths []string) map[string][]string { - ret := map[string][]string{} - fmt.Fprintf(os.Stderr, "Generating checksums for %d files...\n", len(paths)) - - cnt := 0 - for len(paths) > 0 { - _paths := paths - arg_len := 0 - for i := range _paths { - arg_len += len(_paths[i]) + 1 - if arg_len > arg_max/2 { - _paths = _paths[:i-1] - break - } - } - paths = paths[len(_paths):] - - cmd := exec.Command("sha256sum", append([]string{"--"}, _paths...)...) 
- stdout, err := cmd.StdoutPipe() - errhandle(err) - cmd.Stderr = os.Stderr - errhandle(cmd.Start()) - rd := bufio.NewReader(stdout) - for { - line, err := rd.ReadString('\n') - if line == "" && err == io.EOF { - break - } - errhandle(err) - parts := strings.SplitN(strings.TrimSuffix(line, "\n"), " ", 2) - if len(parts) != 2 { - panic("wut") - } - checksum := parts[0] - filename := strings.TrimPrefix(parts[1], " ") - - ret[checksum] = append(ret[checksum], filename) - cnt++ - fmt.Fprintf(os.Stderr, "\r%d ", cnt) - } - errhandle(cmd.Wait()) - } - - fmt.Fprintf(os.Stderr, "\r...done \n") - return ret -} - -func main() { - // we have no parallelism, don't let syscalls fan-out weird on - // many-core systems - runtime.GOMAXPROCS(1) - - likely := findLikelyDups(os.Args[1:]) - - var flatLikely []string - for _, filenames := range likely { - flatLikely = append(flatLikely, filenames...) - } - - fiemap2filenames := getFiemaps(flatLikely) - - filename2fiemap := map[string]string{} - for fiemap, filenames := range fiemap2filenames { - for _, filename := range filenames { - if _, ok := filename2fiemap[filename]; ok { - panic("not ok") - } - filename2fiemap[filename] = fiemap - } - } - - spanningFiles := make([]string, len(fiemap2filenames)) - i := 0 - for _, filenames := range fiemap2filenames { - spanningFiles[i] = filenames[0] - i++ - } - - checksum2filenames := getChecksums(spanningFiles) - - checksum2fiemaps := map[string][]string{} - for checksum, filenames := range checksum2filenames { - for _, filename := range filenames { - checksum2fiemaps[checksum] = append(checksum2fiemaps[checksum], filename2fiemap[filename]) - } - } - for checksum, fiemaps := range checksum2fiemaps { - if len(fiemaps) < 2 { - delete(checksum2fiemaps, checksum) - } - } - - fmt.Fprintf(os.Stderr, "Deduplicating %d sets of files...\n", len(checksum2fiemaps)) - for checksum, fiemaps := range checksum2fiemaps { - // choose the fiemap with the fewest extents - minFiemap := fiemaps[0] - minFiemapLen := 
strings.Count(minFiemap, "\n") - for _, fiemap := range fiemaps { - fiemapLen := strings.Count(fiemap, "\n") - if fiemapLen < minFiemapLen { - minFiemap = fiemap - minFiemapLen = fiemapLen - } - } - srcFile := fiemap2filenames[minFiemap][0] - var dupFiles []string - for _, fiemap := range fiemaps { - if fiemap == minFiemap { - continue - } - dupFiles = append(dupFiles, fiemap2filenames[fiemap]...) - } - - stat, err := os.Stat(srcFile) - errhandle(err) - args := []string{ - "-r", "--", strconv.FormatInt(stat.Size(), 10), - srcFile, "0", - } - for _, dupFile := range dupFiles { - args = append(args, dupFile, "0") - } - - cmd := exec.Command("./cow-dedupe-range", args...) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - fmt.Println("#", checksum) - fmt.Println(cmd.Args) - errhandle(cmd.Run()) - } -} diff --git a/go/.gitignore b/go/.gitignore new file mode 100644 index 0000000..4ce8181 --- /dev/null +++ b/go/.gitignore @@ -0,0 +1,2 @@ +/bin/ +/pkg/ diff --git a/go/src/cow-dedupe/dedupe.go b/go/src/cow-dedupe/dedupe.go new file mode 100644 index 0000000..f28d2f4 --- /dev/null +++ b/go/src/cow-dedupe/dedupe.go @@ -0,0 +1,245 @@ +package main + +import ( + "bufio" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" +) + +//#include <unistd.h> +import "C" + +var arg_max = int(C.sysconf(C._SC_ARG_MAX)) + +func errhandle(err error) { + if err != nil { + panic(err) + } +} + +func findLikelyDups(paths []string) map[string][]string { + ret := map[string][]string{} + var err error + for i := range paths { + paths[i], err = filepath.Abs(paths[i]) + errhandle(err) + } + cmd := exec.Command("find", append(paths, "-type", "f", "-printf", "%s %p\\0")...) 
+ stdout, err := cmd.StdoutPipe() + errhandle(err) + cmd.Stderr = os.Stderr + errhandle(cmd.Start()) + rd := bufio.NewReader(stdout) + for { + line, err := rd.ReadString('\x00') + if line == "" && err == io.EOF { + break + } + errhandle(err) + parts := strings.SplitN(strings.TrimSuffix(line, "\x00"), " ", 2) + if len(parts) != 2 { + panic("wut") + } + size := parts[0] + filename := parts[1] + basename := filepath.Base(filename) + key := size + " " + basename + ret[key] = append(ret[key], filename) + } + errhandle(cmd.Wait()) + for key := range ret { + if len(ret[key]) < 2 { + delete(ret, key) + } + } + return ret +} + +func getFiemaps(paths []string) map[string][]string { + ret := map[string][]string{} + fmt.Fprintf(os.Stderr, "Getting fiemaps for %d files...\n", len(paths)) + + cnt := 0 + for len(paths) > 0 { + _paths := paths + arg_len := 0 + for i := range _paths { + arg_len += len(_paths[i]) + 1 + if arg_len > arg_max/2 { + _paths = _paths[:i-1] + break + } + } + paths = paths[len(_paths):] + + cmd := exec.Command("./cow-extent-map", append([]string{"-m", "--"}, _paths...)...) 
+ stdout, err := cmd.StdoutPipe() + errhandle(err) + cmd.Stderr = os.Stderr + errhandle(cmd.Start()) + rd := bufio.NewReader(stdout) + for { + filename, err := rd.ReadString('\x00') + if filename == "" && err == io.EOF { + break + } + filename = strings.TrimSuffix(filename, "\x00") + if !strings.HasPrefix(filename, "/") { + panic("ugly filename") + } + errhandle(err) + fiemap, err := rd.ReadString('\x00') + fiemap = strings.TrimSuffix(fiemap, "\x00") + if !(strings.HasPrefix(fiemap, "logical=") || fiemap == "") { + panic("ugly fiemap") + } + errhandle(err) + + ret[fiemap] = append(ret[fiemap], filename) + cnt++ + fmt.Fprintf(os.Stderr, "\r%d ", cnt) + } + errhandle(cmd.Wait()) + } + + fmt.Fprintf(os.Stderr, "\r...done \n") + return ret +} + +func getChecksums(paths []string) map[string][]string { + ret := map[string][]string{} + fmt.Fprintf(os.Stderr, "Generating checksums for %d files...\n", len(paths)) + + cnt := 0 + for len(paths) > 0 { + _paths := paths + arg_len := 0 + for i := range _paths { + arg_len += len(_paths[i]) + 1 + if arg_len > arg_max/2 { + _paths = _paths[:i-1] + break + } + } + paths = paths[len(_paths):] + + cmd := exec.Command("sha256sum", append([]string{"--"}, _paths...)...) 
+ stdout, err := cmd.StdoutPipe() + errhandle(err) + cmd.Stderr = os.Stderr + errhandle(cmd.Start()) + rd := bufio.NewReader(stdout) + for { + line, err := rd.ReadString('\n') + if line == "" && err == io.EOF { + break + } + errhandle(err) + parts := strings.SplitN(strings.TrimSuffix(line, "\n"), " ", 2) + if len(parts) != 2 { + panic("wut") + } + checksum := parts[0] + filename := strings.TrimPrefix(parts[1], " ") + + ret[checksum] = append(ret[checksum], filename) + cnt++ + fmt.Fprintf(os.Stderr, "\r%d ", cnt) + } + errhandle(cmd.Wait()) + } + + fmt.Fprintf(os.Stderr, "\r...done \n") + return ret +} + +func main() { + // we have no parallelism, don't let syscalls fan-out weird on + // many-core systems + runtime.GOMAXPROCS(1) + + likely := findLikelyDups(os.Args[1:]) + + var flatLikely []string + for _, filenames := range likely { + flatLikely = append(flatLikely, filenames...) + } + + fiemap2filenames := getFiemaps(flatLikely) + + filename2fiemap := map[string]string{} + for fiemap, filenames := range fiemap2filenames { + for _, filename := range filenames { + if _, ok := filename2fiemap[filename]; ok { + panic("not ok") + } + filename2fiemap[filename] = fiemap + } + } + + spanningFiles := make([]string, len(fiemap2filenames)) + i := 0 + for _, filenames := range fiemap2filenames { + spanningFiles[i] = filenames[0] + i++ + } + + checksum2filenames := getChecksums(spanningFiles) + + checksum2fiemaps := map[string][]string{} + for checksum, filenames := range checksum2filenames { + for _, filename := range filenames { + checksum2fiemaps[checksum] = append(checksum2fiemaps[checksum], filename2fiemap[filename]) + } + } + for checksum, fiemaps := range checksum2fiemaps { + if len(fiemaps) < 2 { + delete(checksum2fiemaps, checksum) + } + } + + fmt.Fprintf(os.Stderr, "Deduplicating %d sets of files...\n", len(checksum2fiemaps)) + for checksum, fiemaps := range checksum2fiemaps { + // choose the fiemap with the fewest extents + minFiemap := fiemaps[0] + minFiemapLen := 
strings.Count(minFiemap, "\n") + for _, fiemap := range fiemaps { + fiemapLen := strings.Count(fiemap, "\n") + if fiemapLen < minFiemapLen { + minFiemap = fiemap + minFiemapLen = fiemapLen + } + } + srcFile := fiemap2filenames[minFiemap][0] + var dupFiles []string + for _, fiemap := range fiemaps { + if fiemap == minFiemap { + continue + } + dupFiles = append(dupFiles, fiemap2filenames[fiemap]...) + } + + stat, err := os.Stat(srcFile) + errhandle(err) + args := []string{ + "-r", "--", strconv.FormatInt(stat.Size(), 10), + srcFile, "0", + } + for _, dupFile := range dupFiles { + args = append(args, dupFile, "0") + } + + cmd := exec.Command("./cow-dedupe-range", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + fmt.Println("#", checksum) + fmt.Println(cmd.Args) + errhandle(cmd.Run()) + } +} -- cgit v1.2.3-2-g168b