from model import AmbarFileMeta, AmbarFileContent
from zipfile import ZipFile, ZipInfo
from datetime import datetime
from hashlib import sha256
import hashlib
import re
import io


class ArchiveProcessor():
    """Unpack a zip archive held in memory and submit its members for processing.

    For every archive member whose name matches the supported-extension
    pattern, the member's bytes are hashed, the content is registered through
    the API proxy if it is not known yet, and a meta record is enqueued.
    """

    ##TODO: Get fileRegex from crawler settings
    # Compiled once for the class instead of on every Process() call.
    _FILE_REGEX = re.compile(
        r'(\.doc[a-z]*$)|(\.xls[a-z]*$)|(\.txt$)|(\.csv$)|(\.htm[a-z]*$)'
        r'|(\.ppt[a-z]*$)|(\.pdf$)|(\.msg$)|(\.zip$)|(\.eml$)|(\.rtf$)|(\.md$)'
        r'|(\.png$)|(\.bmp$)|(\.tif[f]*$)|(\.jp[e]*g$)',
        re.I)

    # General-purpose flag bit 11: the member name is stored as UTF-8.
    _UTF8_NAME_FLAG = 0x800

    def __init__(self, Logger, ApiProxy):
        """Store the logger and the API proxy used by Process()."""
        self.logger = Logger
        self.apiProxy = ApiProxy

    def Process(self, FileData, FileMeta, SourceId):
        """Walk the members of a zip archive and submit each supported one.

        FileData -- raw bytes of the zip archive.
        FileMeta -- meta of the archive itself; ``full_name`` and ``source_id``
                    are read from it.
        SourceId -- passed through to the enqueue call for every member.

        Nothing is returned and no exception is propagated: failures are
        reported through the logger.
        """
        self.logger.LogMessage('verbose', 'unzipping {0}'.format(FileMeta.full_name))

        try:
            with ZipFile(io.BytesIO(FileData)) as zipFile:
                for zipFileInfo in zipFile.infolist():
                    # A single broken member (bad timestamp, corrupt or
                    # encrypted data) must not abort the rest of the archive.
                    try:
                        self._ProcessEntry(zipFile, zipFileInfo, FileMeta, SourceId)
                    except Exception as ex:
                        self.logger.LogMessage('info', 'unable to unpack {0}/{1} {2}'.format(FileMeta.full_name, zipFileInfo.filename, ex))
        except Exception as ex:
            self.logger.LogMessage('info', 'unable to unpack {0} {1}'.format(FileMeta.full_name, ex))

    @classmethod
    def _DecodeEntryName(cls, zipFileInfo):
        """Return the member name, re-decoding legacy names as CP866."""
        # Names flagged as UTF-8 are already decoded correctly by zipfile;
        # re-decoding them would corrupt characters that also exist in CP437.
        if zipFileInfo.flag_bits & cls._UTF8_NAME_FLAG:
            return zipFileInfo.filename

        try:
            return zipFileInfo.filename.encode('CP437').decode('CP866')
        except UnicodeError:
            return zipFileInfo.filename

    def _ProcessEntry(self, zipFile, zipFileInfo, FileMeta, SourceId):
        """Filter, read and hash one archive member, then submit it."""
        unicodeName = self._DecodeEntryName(zipFileInfo)

        if not self._FILE_REGEX.search(unicodeName):
            self.logger.LogMessage('verbose', 'ignoring {0}/{1}'.format(FileMeta.full_name, unicodeName))
            return

        # Skip empty members before paying for decompression and hashing.
        if zipFileInfo.file_size == 0:
            return

        fullNameInArchive = '{0}/{1}'.format(FileMeta.full_name, unicodeName)
        createUpdateTime = datetime(*zipFileInfo.date_time)

        # Reading by ZipInfo closes the member stream and picks exactly this
        # member even when the archive contains duplicate names.
        fileData = zipFile.read(zipFileInfo)
        sha = sha256(fileData).hexdigest()

        if not self._EnsureContent(fileData, sha, fullNameInArchive):
            return

        self._EnqueueMeta(createUpdateTime, unicodeName, fullNameInArchive, FileMeta, sha, SourceId)

    def _EnsureContent(self, fileData, sha, fullNameInArchive):
        """Create the content via the API proxy unless it already exists.

        Returns True when processing of the member may continue, False when an
        error response was logged and the member must be skipped.
        """
        ## checking content existance
        apiResp = self.apiProxy.CheckIfParsedAmbarFileContentExists(sha)

        if not apiResp.Success:
            self.logger.LogMessage('error', 'error checking content existance {0} {1}'.format(fullNameInArchive, apiResp.message))
            return False

        if not (apiResp.Found or apiResp.NotFound):
            self.logger.LogMessage('error', 'unexpected response on checking content existance {0} {1} {2}'.format(fullNameInArchive, apiResp.code, apiResp.message))
            return False

        if apiResp.NotFound:
            self.logger.LogMessage('verbose', 'content not found {0}'.format(fullNameInArchive))

            ## creating content
            createContentApiResp = self.apiProxy.CreateAmbarFileContent(fileData, sha)

            if not createContentApiResp.Success:
                self.logger.LogMessage('error', 'error creating content {0} {1}'.format(fullNameInArchive, createContentApiResp.message))
                return False

            if not (createContentApiResp.Found or createContentApiResp.Created):
                self.logger.LogMessage('error', 'unexpected response on create content {0} {1} {2}'.format(fullNameInArchive, createContentApiResp.code, createContentApiResp.message))
                return False

            if createContentApiResp.Found:
                self.logger.LogMessage('verbose', 'content found {0}'.format(fullNameInArchive))

            if createContentApiResp.Created:
                self.logger.LogMessage('verbose', 'content created {0}'.format(fullNameInArchive))

        if apiResp.Found:
            self.logger.LogMessage('verbose', 'content found {0}'.format(fullNameInArchive))

        return True

    def _EnqueueMeta(self, createUpdateTime, unicodeName, fullNameInArchive, FileMeta, sha, SourceId):
        """Build the member's meta record and send it back to the queue."""
        ## sending meta back to queue
        fileMeta = AmbarFileMeta.InitWithoutId(createUpdateTime, createUpdateTime, unicodeName, fullNameInArchive, FileMeta.source_id)

        apiResp = self.apiProxy.EnqueueAmbarFileMeta(fileMeta, sha, SourceId)

        if not apiResp.Success:
            self.logger.LogMessage('error', 'error adding meta {0} {1}'.format(fileMeta.full_name, apiResp.message))
            return

        if apiResp.BadRequest:
            self.logger.LogMessage('verbose', 'bad meta, ignoring... {0}'.format(fileMeta.full_name))
            return

        if apiResp.InsufficientStorage:
            # The original format string had no placeholder, so the file name
            # passed to format() never reached the log.
            self.logger.LogMessage('verbose', 'insufficient storage {0}'.format(fileMeta.full_name))
            return

        if not apiResp.Ok:
            self.logger.LogMessage('error', 'unexpected response on adding meta {0} {1} {2}'.format(fileMeta.full_name, apiResp.code, apiResp.message))
            return

        self.logger.LogMessage('verbose', 'meta added {0}'.format(fileMeta.full_name))