MacOS : Python : Découverte de l’API Python Elasticsearch

209 x served & 35 x viewed

J’ai voulu faire un premier programme afin de découvrir l’API Elasticsearch, comme base d’information j’ai pris mes emails. C’est assez simple, toutes les personnes sous MacOS ont des emails …

Voici donc le petit programme en Python (pour Michel) : il suffit de changer #ff0000;">MonUser.

#!/usr/bin/env python3

import email
import plistlib
import re
import glob, os
from datetime import datetime
from email.utils import parsedate_to_datetime
from email.header import Header, decode_header, make_header
from elasticsearch import Elasticsearch 

class Emlx(object):
        def __init__(self):
            super(Emlx, self).__init__()
            self.bytecount = 0
            self.msg_data = None
            self.msg_plist = None

        def parse(self, filename_path):
            with open(filename_path, "rb") as f:
                self.bytecount = int(f.readline().strip())
                self.msg_data = email.message_from_bytes(f.read(self.bytecount))
                self.msg_plist = plistlib.loads(f.read())
            return self.msg_data, self.msg_plist

if __name__ == '__main__':
   msg = Emlx()
   nb_parse = 0
   path_mail = "/Users/#ff0000;">MonUser/Library/Mail/V6/"
   es_keys = "mail"
   es=Elasticsearch([{'host':'localhost','port':9200}])
   for root, dirs, files in os.walk(path_mail):
      for file in files:
          if file.endswith(".emlx"):
             file_full = os.path.join(root, file)
             message, plist = msg.parse(file_full)
             statinfo = os.stat(file_full)
             my_date = message['Date']
             my_id = message['Message-ID']
             my_server = message['Received']
             if my_date is not None and my_date is not Header:
                 my_date_str = datetime.fromtimestamp(parsedate_to_datetime(my_date).timestamp()).strftime('%Y-%m-%dT%H:%M:%S')
             my_email = str(message['From'])
             if my_email is not None:
                 my_domain = re.search("@[\w.\-\_]+", str(my_email))
             if my_email is not None:
                 my_name = re.search("[\w.\-\_]+@", str(my_email))
             if my_domain is not None:
                 #print(my_domain.group())
                 #print(my_name.group())
                 json = '{"name":"'+my_name.group()+'","domain":"'+my_domain.group()+'"'
             else:
                 my_email = my_email.replace(",","")
                 my_email = my_email.replace('"','')
                 json = '{"name":"'+my_email+'","domain":"None"';
             if my_date is not None:
                 json = json+',"date":"'+my_date_str+'","size":'+str(statinfo.st_size)+',"id":'+str(nb_parse)
             else:
                 json = json+',"size":'+str(statinfo.st_size)+',"id":'+str(nb_parse)
             if my_server is not None and my_server is not Header:
                 ip = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', str(my_server))
                 if ip is not None:
                    my_ip = ip.group()
                    json = json+',"ip":"'+str(my_ip)+'"'
                 else:
                    my_ip = ""
                 #ip = re.findall(r'\b25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?\.25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?\.25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?\.25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?\b',my_server)
                 #ip = re.findall( r'[0-9]+(?:\.[0-9]+){1,3}', my_server )
                 #ip = re.findall(r'[\d.-]+', my_server) 
             else:
                 json = json
             if my_id is not None and my_id is not Header:
                 my_id =my_id.strip()
                 my_id =my_id.strip('\n')
                 json = json+',"Message-ID":"'+my_id+'","file":"'+file+'"}'
             else:
                 json = json+',"file":"'+file+'"}'
             print(json)
             res = es.index(index=es_keys,doc_type='emlx',id=nb_parse,body=json)
             nb_parse += 1
             #print(plist)
   print(nb_parse)

Le but de ce programme c’est simplement de mieux comprendre l’API.
Pour le lancer j’ai fait :

sudo python3 ParseEmail.py > email-json.txt

A noter que le Terminal doit avoir certains droits pour que cela fonctionne : https://www.cyber-neurones.org/2019/11/macos-acces-a-library-mail-via-un-terminal/ .

Ensuite pour faire un petit contrôle il suffit de faire : http://localhost:9200/mail/_mappings .

{"mail":{"mappings":{"emlx":{"properties":{"Message-ID":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"date":{"type":"date"},"domain":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"file":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"id":{"type":"long"},"ip":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"name":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"size":{"type":"long"}}}}}}

Je viens de lancer le programme … c’est très long, voici ce qu’il a pour l’instant en base (sur les 20 dernières années) :

En mode Histogramme :

1 réflexion sur « MacOS : Python : Découverte de l’API Python Elasticsearch »

Laisser un commentaire

Votre adresse e-mail ne sera pas publiée.

Time limit is exhausted. Please reload CAPTCHA.