J’ai voulu écrire un premier programme afin de découvrir l’API Elasticsearch ; comme base d’information, j’ai pris mes emails. C’est assez simple : toutes les personnes sous macOS ont des emails…
Voici donc le petit programme en Python (pour Michel) : il suffit de remplacer MonUser par votre nom d’utilisateur dans le chemin path_mail.
#!/usr/bin/env python3
"""Index the headers of Apple Mail ``.emlx`` files into Elasticsearch.

The script walks the Mail directory, parses every ``.emlx`` file it finds and
indexes one small document per message (sender, date, file size, first IPv4
address of the ``Received`` header, Message-ID and file name) into the
``mail`` index of an Elasticsearch node listening on localhost:9200.
"""
import email
import glob
import json
import os
import plistlib
import re
import sys
from datetime import datetime, timezone
from email.header import Header, decode_header, make_header
from email.utils import parsedate_to_datetime
from xml.parsers.expat import ExpatError

# Compiled once: these patterns are applied to every message.
_LOCAL_PART_RE = re.compile(r"[\w.-]+@")
_DOMAIN_RE = re.compile(r"@[\w.-]+")
_IPV4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")


class Emlx(object):
    """Reader for a single ``.emlx`` file.

    As read by :meth:`parse`, the file starts with a line holding a byte
    count, followed by that many bytes of message data, followed by a
    property list.
    """

    def __init__(self):
        super().__init__()
        self.bytecount = 0
        self.msg_data = None
        self.msg_plist = None

    def parse(self, filename_path):
        """Parse *filename_path* and return ``(message, plist)``.

        Args:
            filename_path: path of the ``.emlx`` file to read.

        Returns:
            A tuple of the ``email.message.Message`` built from the message
            bytes and the object decoded from the trailing property list.

        Raises:
            OSError: the file cannot be opened or read.
            ValueError: the first line is not an integer, or the trailing
                data is not a recognised property list.
            xml.parsers.expat.ExpatError: the trailing XML plist is malformed.
        """
        with open(filename_path, "rb") as f:
            self.bytecount = int(f.readline().strip())
            self.msg_data = email.message_from_bytes(f.read(self.bytecount))
            self.msg_plist = plistlib.loads(f.read())
        return self.msg_data, self.msg_plist


def _header_text(value):
    """Return a header value as ``str``, or ``None`` when it is absent.

    ``Message.__getitem__`` may hand back an ``email.header.Header`` object
    instead of a ``str`` (for headers carrying undecodable bytes); ``str()``
    normalises both cases.
    """
    return None if value is None else str(value)


def _format_date(raw_date):
    """Convert an RFC 2822 date string to ``YYYY-MM-DDTHH:MM:SS`` in UTC.

    Returns ``None`` when *raw_date* is ``None`` or cannot be parsed.  The
    result carries no offset, so it is normalised to UTC first: Elasticsearch
    interprets offset-less dates as UTC.
    """
    if raw_date is None:
        return None
    try:
        parsed = parsedate_to_datetime(raw_date)
        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(timezone.utc)
        # A naive result means the header said "-0000", which already
        # denotes a UTC time per the email.utils documentation.
    except (TypeError, ValueError, OverflowError):
        # TypeError on older Pythons, ValueError on newer ones.
        return None
    return parsed.strftime("%Y-%m-%dT%H:%M:%S")


def _split_sender(sender):
    """Split a ``From`` value into ``(name, domain)``.

    ``name`` keeps its trailing ``@`` and ``domain`` its leading ``@`` so the
    values stay identical to those indexed by earlier runs.  When no complete
    address is found, the raw value (minus commas and double quotes) is
    returned as the name with the literal domain ``"None"``.
    """
    local_part = _LOCAL_PART_RE.search(sender)
    domain = _DOMAIN_RE.search(sender)
    if local_part is not None and domain is not None:
        return local_part.group(), domain.group()
    return sender.replace(",", "").replace('"', ""), "None"


def build_document(message, file_name, size, doc_id):
    """Build the dictionary indexed for one message.

    Args:
        message: parsed ``email.message.Message``.
        file_name: base name of the ``.emlx`` file.
        size: size of the file in bytes.
        doc_id: sequential identifier of the document.

    Returns:
        A dict with the keys ``name``, ``domain``, ``size``, ``id`` and
        ``file``, plus ``date``, ``ip`` and ``Message-ID`` when the matching
        header is present and usable.
    """
    # str(None) == "None": a missing From header is indexed as the name "None".
    name, domain = _split_sender(str(message["From"]))
    doc = {"name": name, "domain": domain}

    date_str = _format_date(_header_text(message["Date"]))
    if date_str is not None:
        doc["date"] = date_str

    doc["size"] = size
    doc["id"] = doc_id

    # message["Received"] yields only the first Received header of the message.
    received = _header_text(message["Received"])
    if received is not None:
        ip_match = _IPV4_RE.search(received)
        if ip_match is not None:
            doc["ip"] = ip_match.group()

    message_id = _header_text(message["Message-ID"])
    if message_id is not None:
        doc["Message-ID"] = message_id.strip()

    doc["file"] = file_name
    return doc


def main():
    """Walk the Mail directory and index every readable ``.emlx`` file."""
    # Imported here so the parsing helpers above remain importable on a
    # machine where the Elasticsearch client is not installed.
    from elasticsearch import Elasticsearch

    msg = Emlx()
    nb_parse = 0
    path_mail = "/Users/MonUser/Library/Mail/V6/"
    es_keys = "mail"
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    for root, _dirs, files in os.walk(path_mail):
        for file in files:
            if not file.endswith(".emlx"):
                continue
            file_full = os.path.join(root, file)
            try:
                message, _plist = msg.parse(file_full)
                size = os.stat(file_full).st_size
            except (OSError, ValueError, ExpatError) as err:
                # One unreadable file must not abort a run that takes hours.
                # stderr keeps the redirected stdout a clean list of documents.
                print("skipping %s: %s" % (file_full, err), file=sys.stderr)
                continue

            doc = build_document(message, file, size, nb_parse)
            print(json.dumps(doc, ensure_ascii=False, separators=(",", ":")))
            # NOTE(review): doc_type matches the "emlx" mapping type used by
            # this index; confirm it is still accepted by the client and
            # cluster version in use.
            es.index(index=es_keys, doc_type='emlx', id=nb_parse, body=doc)
            nb_parse += 1

    print(nb_parse)


if __name__ == '__main__':
    main()
Le but de ce programme est simplement de mieux comprendre l’API.
Pour le lancer j’ai fait :
sudo python3 ParseEmail.py > email-json.txt
À noter que le Terminal doit avoir certains droits pour que cela fonctionne : https://www.cyber-neurones.org/2019/11/macos-acces-a-library-mail-via-un-terminal/ .
Ensuite pour faire un petit contrôle il suffit de faire : http://localhost:9200/mail/_mappings .
{"mail":{"mappings":{"emlx":{"properties":{"Message-ID":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"date":{"type":"date"},"domain":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"file":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"id":{"type":"long"},"ip":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"name":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"size":{"type":"long"}}}}}}
Je viens de lancer le programme … c’est très long, voici ce qu’il y a pour l’instant en base (sur les 20 dernières années) :
La version V2 est ici : http://www.cyber-neurones.org/2019/11/macos-python-decouverte-de-lapi-python-elasticsearch-kibana-version-v2/ .