|
| 1 | +#!/usr/bin/env escript |
| 2 | + |
| 3 | +-include_lib("kernel/include/file.hrl"). |
| 4 | + |
| 5 | +-import(lists, [reverse/1]). |
| 6 | + |
%% Writes the one-line usage banner for this script to standard output.
%% The script takes no arguments, so the banner is just the invocation.
print_usage() ->
    io:fwrite("./find_duplicate_jpegs~n", []).
| 9 | + |
%% escript entry point.
%%
%% With no arguments: recursively scans "/" for files matching "*.jpg",
%% groups them by the MD5 digest of their contents, prints every group that
%% contains more than one file, then halts with status 0.
%% With any arguments: prints the usage line and halts with status 1.
main([]) ->
    %% files/3 takes a shell-style glob and converts it with
    %% xmerl_regexp:sh_to_awk/1. In a glob "." is already literal; a
    %% backslash in the pattern is itself escaped by the conversion and would
    %% then have to appear literally in the file name, so none is used here.
    Files = files(<<"/">>, "*.jpg", true),
    Duplicates = duplicate_groups(group_by_hash(Files)),
    lists:foreach(fun print_duplicate_group/1, Duplicates),
    halt(0);
main(_) ->
    %% Print usage message if invalid number of arguments is passed in
    print_usage(),
    halt(1).

%% Folds the file list into a map from MD5 digest to the list of file names
%% whose contents hash to that digest.
group_by_hash(Files) ->
    lists:foldl(fun add_file_hash/2, #{}, Files).

%% Adds one file to the digest map. A file that cannot be read (permission
%% denied, removed since it was listed, ...) is skipped so that a single bad
%% file does not abort the scan of the whole tree.
add_file_hash(Filename, Groups) ->
    case file:read_file(Filename) of
        {ok, Data} ->
            Hash = erlang:md5(Data),
            maps:update_with(
                Hash,
                fun(Names) -> [Filename | Names] end,
                [Filename],
                Groups
            );
        {error, _Reason} ->
            Groups
    end.

%% Returns the {Hash, Filenames} pairs that have at least two file names.
%% The [_, _ | _] pattern avoids walking the whole list with length/1.
duplicate_groups(Groups) ->
    [Group || {_Hash, [_, _ | _]} = Group <- maps:to_list(Groups)].

%% Prints one group of duplicates: the digest in hex followed by one file
%% name per line. File names use ~ts so that UTF-8 encoded names are printed
%% as characters rather than as individual latin1 bytes.
print_duplicate_group({Hash, Filenames}) ->
    io:format(
        "Duplicate files with content hashed as ~s:~n~ts~n~n",
        [hex_string(Hash), lists:join("\n", Filenames)]
    ).
| 42 | + |
%% Renders a binary as hexadecimal text.
%% Adapted from
%% https://stackoverflow.com/questions/3768197/erlang-ioformatting-a-binary-to-hex
%% Each byte is formatted as two zero-padded, upper-case base-16 characters,
%% producing a printable (deep) character list. Any trailing bits that do not
%% make up a whole byte are ignored.
hex_string(<<Byte:8, Rest/bitstring>>) ->
    [io_lib:format("~2.16.0B", [Byte]) | hex_string(Rest)];
hex_string(Rest) when is_bitstring(Rest) ->
    [].
| 48 | + |
| 49 | +% Code from the lib_find module |
| 50 | +%% --- |
| 51 | +%% Excerpted from "Programming Erlang, Second Edition", |
| 52 | +%% published by The Pragmatic Bookshelf. |
| 53 | +%% Copyrights apply to this code. It may not be used to create training material, |
| 54 | +%% courses, books, articles, and the like. Contact us if you are in doubt. |
| 55 | +%% We make no guarantees that this code is fit for any purpose. |
| 56 | +%% Visit http://www.pragmaticprogrammer.com/titles/jaerlang2 for more book information. |
| 57 | +%%--- |
| 58 | + |
%% Returns the list of regular files under Dir whose full path matches the
%% shell-style pattern Re.
%%
%%   Dir  - directory to start from
%%   Re   - shell-style pattern, converted with xmerl_regexp:sh_to_awk/1
%%   Flag - forwarded to files/5 as Recursive; true descends into
%%          subdirectories
%%
%% Matches are accumulated by prepending, so the result is reversed at the
%% end to restore the order in which files were visited.
files(Dir, Re, Flag) ->
    Regex = xmerl_regexp:sh_to_awk(Re),
    Collect = fun(File, Found) -> [File | Found] end,
    Found = files(Dir, Regex, Flag, Collect, []),
    lists:reverse(Found).
| 62 | + |
%% Folds Fun over the matching files found in Dir, starting from Acc.
%% A directory that cannot be listed contributes nothing: the accumulator is
%% returned unchanged.
files(Dir, Reg, Recursive, Fun, Acc) ->
    Listing = file:list_dir(Dir),
    case Listing of
        {error, _Reason} ->
            Acc;
        {ok, Entries} ->
            find_files(Entries, Dir, Reg, Recursive, Fun, Acc)
    end.
| 68 | + |
%% Walks the directory entries of Dir one by one, threading the accumulator
%% through each entry, and returns the final accumulator.
find_files([], _Dir, _Reg, _Recursive, _Fun, Acc) ->
    Acc;
find_files([Entry | Rest], Dir, Reg, Recursive, Fun, Acc0) ->
    Path = filename:join([Dir, Entry]),
    Acc1 = visit_entry(file_type(Path), Path, Reg, Recursive, Fun, Acc0),
    find_files(Rest, Dir, Reg, Recursive, Fun, Acc1).

%% Handles a single entry according to its type:
%%   regular   - Fun is applied when the full path matches Reg
%%   directory - descended into only when Recursive is true
%%   error     - anything else (or an unreadable entry) is ignored
visit_entry(regular, Path, Reg, _Recursive, Fun, Acc) ->
    case re:run(Path, Reg, [{capture, none}]) of
        match -> Fun(Path, Acc);
        nomatch -> Acc
    end;
visit_entry(directory, Path, Reg, true, Fun, Acc) ->
    files(Path, Reg, true, Fun, Acc);
visit_entry(directory, _Path, _Reg, false, _Fun, Acc) ->
    Acc;
visit_entry(error, _Path, _Reg, _Recursive, _Fun, Acc) ->
    Acc.
| 93 | + |
%% Classifies File using file:read_file_info/1.
%% Returns regular or directory for those two types, and error for every
%% other type or when the file information cannot be read.
file_type(File) ->
    case file:read_file_info(File) of
        {ok, #file_info{type = Type}} when Type =:= regular; Type =:= directory ->
            Type;
        _ ->
            error
    end.
0 commit comments