Commit 86c55efd authored by Leo Le Bouter

Use MsgPack instead of JSON, add command line arguments + bug fixes

* Convert stat_result to a proper dictionary so that field names are
  retained after serialization

* Add the ability to ignore directories through command-line arguments,
  and explicitly add an "ignored" field to ignored directories

It was decided that JSON was not a suitable format because its support
for serializing bytes is lacking. MsgPack supports bytes and is more
efficient; it is also the internal serialization format of Fluentd,
which we will most probably use for ingesting data in a central
place.
parent 02a190aa
...@@ -8,8 +8,7 @@ import traceback ...@@ -8,8 +8,7 @@ import traceback
import hashlib import hashlib
import io import io
import multiprocessing import multiprocessing
import codecs from msgpack import dump
from json import JSONEncoder
import psutil import psutil
import posix1e # pylibacl import posix1e # pylibacl
...@@ -39,7 +38,28 @@ def compute_hashes(entry_path): ...@@ -39,7 +38,28 @@ def compute_hashes(entry_path):
"sha512": sha512.hexdigest()} "sha512": sha512.hexdigest()}
def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_whitelist=None): def stat_result_to_dict(stat_result):
return {
"st_mode": stat_result.st_mode if hasattr(stat_result, "st_mode") else None,
"st_ino": stat_result.st_ino if hasattr(stat_result, "st_ino") else None,
"st_dev": stat_result.st_dev if hasattr(stat_result, "st_dev") else None,
"st_nlink": stat_result.st_nlink if hasattr(stat_result, "st_nlink") else None,
"st_uid": stat_result.st_uid if hasattr(stat_result, "st_uid") else None,
"st_gid": stat_result.st_gid if hasattr(stat_result, "st_gid") else None,
"st_size": stat_result.st_size if hasattr(stat_result, "st_size") else None,
"st_atime": stat_result.st_atime if hasattr(stat_result, "st_atime") else None,
"st_mtime": stat_result.st_mtime if hasattr(stat_result, "st_mtime") else None,
"st_ctime": stat_result.st_ctime if hasattr(stat_result, "st_ctime") else None,
"st_blocks": stat_result.st_blocks if hasattr(stat_result, "st_blocks") else None,
"st_blksize": stat_result.st_blksize if hasattr(stat_result, "st_blksize") else None,
"st_rdev": stat_result.st_rdev if hasattr(stat_result, "st_rdev") else None,
"st_flags": stat_result.st_flags if hasattr(stat_result, "st_flags") else None,
"st_gen": stat_result.st_gen if hasattr(stat_result, "st_gen") else None,
"st_birthtime": stat_result.st_birthtime if hasattr(stat_result, "st_birthtime") else None,
}
def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_whitelist=None, ignored_dirs=[]):
is_first_call = False is_first_call = False
if mp_pool == None: if mp_pool == None:
...@@ -47,12 +67,18 @@ def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_wh ...@@ -47,12 +67,18 @@ def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_wh
mp_pool = multiprocessing.Pool() mp_pool = multiprocessing.Pool()
if cur_dict == None: if cur_dict == None:
cur_dict = {"stat": os.stat(path, follow_symlinks=False), cur_dict = {"stat": stat_result_to_dict(os.stat(path, follow_symlinks=False)),
"childs": dict()} "childs": dict()}
if dev_whitelist != None: if dev_whitelist != None:
path_stat = cur_dict["stat"] path_stat = cur_dict["stat"]
if not path_stat.st_dev in dev_whitelist: if "st_dev" in path_stat:
if not path_stat["st_dev"] in dev_whitelist:
return cur_dict
for dir in ignored_dirs:
if path.startswith(dir):
cur_dict["ignored"] = True
return cur_dict return cur_dict
try: try:
...@@ -64,26 +90,24 @@ def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_wh ...@@ -64,26 +90,24 @@ def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_wh
try: try:
entry_stat = os.stat(entry_path, follow_symlinks=False) entry_stat = os.stat(entry_path, follow_symlinks=False)
except Exception: except Exception:
traceback.print_exc() traceback.print_exc()
entry_stat = None entry_stat = None
cur_dict["childs"][entry_name] = {"stat": entry_stat, cur_dict["childs"][entry_name] = {"stat": stat_result_to_dict(entry_stat),
"childs": dict()} "childs": dict()}
try: try:
cur_dict["childs"][entry_name]["xattrs"] = dict() cur_dict["childs"][entry_name]["xattrs"] = dict()
for k in os.listxattr(entry_path, follow_symlinks=False): for k in os.listxattr(entry_path, follow_symlinks=False):
cur_dict["childs"][entry_name]["xattrs"][k] = codecs.decode( cur_dict["childs"][entry_name]["xattrs"][k] = os.getxattr(
os.getxattr(entry_path, k, follow_symlinks=False), "utf-8") entry_path, k, follow_symlinks=False)
except Exception: except Exception:
traceback.print_exc() traceback.print_exc()
try: try:
cur_dict["childs"][entry_name]["posix_acls"] = codecs.decode(posix1e.ACL(file=entry_path) cur_dict["childs"][entry_name]["posix_acls"] = posix1e.ACL(
.to_any_text(options=posix1e.TEXT_ALL_EFFECTIVE), file=entry_path).to_any_text(options=posix1e.TEXT_ALL_EFFECTIVE)
"utf-8")
except Exception: except Exception:
traceback.print_exc() traceback.print_exc()
...@@ -120,7 +144,14 @@ def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_wh ...@@ -120,7 +144,14 @@ def construct_fs_tree(mp_pool=None, mp_tasks=[], cur_dict=None, path="/", dev_wh
return cur_dict return cur_dict
def main(argv): def main():
parser = argparse.ArgumentParser(
description="Collect and report metadata about a system")
parser.add_argument("start_directory", type=str, default="/")
parser.add_argument("--ignored-dirs", type=str, nargs="+", default=[])
args = parser.parse_args()
parts = psutil.disk_partitions(all=False) parts = psutil.disk_partitions(all=False)
dev_whitelist = list() dev_whitelist = list()
...@@ -128,10 +159,11 @@ def main(argv): ...@@ -128,10 +159,11 @@ def main(argv):
dev_whitelist.append( dev_whitelist.append(
os.stat(part.mountpoint, follow_symlinks=False).st_dev) os.stat(part.mountpoint, follow_symlinks=False).st_dev)
tree = construct_fs_tree(path='/', dev_whitelist=dev_whitelist) tree = construct_fs_tree(path=args.start_directory,
dev_whitelist=dev_whitelist, ignored_dirs=args.ignored_dirs)
print(JSONEncoder(separators=(',', ':')).encode(tree)) dump(tree, sys.stdout.buffer)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv) main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment