You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ambar2/Pipeline/containerprocessors/archiveprocessor.py

106 lines
5.4 KiB
Python

from model import AmbarFileMeta, AmbarFileContent
from zipfile import ZipFile, ZipInfo
from datetime import datetime
from hashlib import sha256
import hashlib
import re
import io
class ArchiveProcessor():
    """Unpack a ZIP archive and submit each supported entry for processing.

    For every archive member whose name matches the supported-extension
    pattern, the content is stored through the API proxy (unless the proxy
    reports it already exists) and a file meta describing the member is
    enqueued. All outcomes are reported through the injected logger.
    """

    ## TODO: Get fileRegex from crawler settings
    _FILE_REGEX = re.compile(
        r'(\.doc[a-z]*$)|(\.xls[a-z]*$)|(\.txt$)|(\.csv$)|(\.htm[a-z]*$)'
        r'|(\.ppt[a-z]*$)|(\.pdf$)|(\.msg$)|(\.zip$)|(\.eml$)|(\.rtf$)'
        r'|(\.md$)|(\.png$)|(\.bmp$)|(\.tif[f]*$)|(\.jp[e]*g$)',
        re.I)

    # General purpose bit 11 of a ZIP entry: the member name is stored as
    # UTF-8, so zipfile has already decoded it correctly.
    _UTF8_NAME_FLAG = 0x800

    def __init__(self, Logger, ApiProxy):
        """Keep the logger and API proxy used while processing archives.

        Logger must provide LogMessage(level, message); ApiProxy must provide
        CheckIfParsedAmbarFileContentExists, CreateAmbarFileContent and
        EnqueueAmbarFileMeta.
        """
        self.logger = Logger
        self.apiProxy = ApiProxy

    def Process(self, FileData, FileMeta, SourceId):
        """Walk the archive in FileData and submit every supported member.

        FileData is the archive as a bytes-like object. FileMeta describes
        the archive itself; its full_name and source_id attributes are read.
        SourceId is forwarded to the API proxy when enqueueing member metas.

        Nothing is returned. Any exception raised while unpacking is logged
        at 'info' level and stops processing of the remaining members.
        """
        self.logger.LogMessage('verbose', 'unzipping {0}'.format(FileMeta.full_name))

        try:
            with ZipFile(io.BytesIO(FileData)) as zipFile:
                for zipFileInfo in zipFile.infolist():
                    self._ProcessEntry(zipFile, zipFileInfo, FileMeta, SourceId)
        except Exception as ex:
            self.logger.LogMessage('info', 'unable to unpack {0} {1}'.format(FileMeta.full_name, ex))

    @staticmethod
    def _DecodeEntryName(zipFileInfo):
        """Return the member name, repairing CP866 names read as CP437."""
        # Names flagged as UTF-8 are already correct; re-coding them would
        # turn valid characters (e.g. 'é') into unrelated Cyrillic letters.
        if zipFileInfo.flag_bits & ArchiveProcessor._UTF8_NAME_FLAG:
            return zipFileInfo.filename
        try:
            # zipfile decodes unflagged names as CP437; undo that and
            # reinterpret the raw bytes as CP866.
            return zipFileInfo.filename.encode('CP437').decode('CP866')
        except UnicodeError:
            return zipFileInfo.filename

    def _ProcessEntry(self, zipFile, zipFileInfo, FileMeta, SourceId):
        """Filter one archive member, store its content and enqueue its meta."""
        unicodeName = self._DecodeEntryName(zipFileInfo)

        if not self._FILE_REGEX.search(unicodeName):
            self.logger.LogMessage('verbose', 'ignoring {0}/{1}'.format(FileMeta.full_name, unicodeName))
            return

        # Empty members are skipped before any decompression or hashing.
        if zipFileInfo.file_size == 0:
            return

        fullNameInArchive = '{0}/{1}'.format(FileMeta.full_name, unicodeName)
        createUpdateTime = datetime(*zipFileInfo.date_time[:6])

        # Reading through the ZipInfo object targets exactly this member
        # (even with duplicate names) and leaves no open handle behind.
        fileData = zipFile.read(zipFileInfo)
        sha = sha256(fileData).hexdigest()

        if not self._EnsureContentStored(fileData, sha, fullNameInArchive):
            return

        self._EnqueueMeta(createUpdateTime, unicodeName, fullNameInArchive, sha, FileMeta, SourceId)

    def _EnsureContentStored(self, fileData, sha, fullNameInArchive):
        """Create the content via the API if it is missing; return True on success."""
        ## checking content existance
        apiResp = self.apiProxy.CheckIfParsedAmbarFileContentExists(sha)

        if not apiResp.Success:
            self.logger.LogMessage('error', 'error checking content existance {0} {1}'.format(fullNameInArchive, apiResp.message))
            return False

        if not (apiResp.Found or apiResp.NotFound):
            self.logger.LogMessage('error', 'unexpected response on checking content existance {0} {1} {2}'.format(fullNameInArchive, apiResp.code, apiResp.message))
            return False

        if apiResp.NotFound:
            self.logger.LogMessage('verbose', 'content not found {0}'.format(fullNameInArchive))

            ## creating content
            createContentApiResp = self.apiProxy.CreateAmbarFileContent(fileData, sha)

            if not createContentApiResp.Success:
                self.logger.LogMessage('error', 'error creating content {0} {1}'.format(fullNameInArchive, createContentApiResp.message))
                return False

            if not (createContentApiResp.Found or createContentApiResp.Created):
                self.logger.LogMessage('error', 'unexpected response on create content {0} {1} {2}'.format(fullNameInArchive, createContentApiResp.code, createContentApiResp.message))
                return False

            if createContentApiResp.Found:
                self.logger.LogMessage('verbose', 'content found {0}'.format(fullNameInArchive))

            if createContentApiResp.Created:
                self.logger.LogMessage('verbose', 'content created {0}'.format(fullNameInArchive))

        if apiResp.Found:
            self.logger.LogMessage('verbose', 'content found {0}'.format(fullNameInArchive))

        return True

    def _EnqueueMeta(self, createUpdateTime, unicodeName, fullNameInArchive, sha, FileMeta, SourceId):
        """Build the meta for one archive member and send it back to the queue."""
        ## sending meta back to queue
        fileMeta = AmbarFileMeta.InitWithoutId(createUpdateTime, createUpdateTime, unicodeName, fullNameInArchive, FileMeta.source_id)

        apiResp = self.apiProxy.EnqueueAmbarFileMeta(fileMeta, sha, SourceId)

        if not apiResp.Success:
            self.logger.LogMessage('error', 'error adding meta {0} {1}'.format(fileMeta.full_name, apiResp.message))
            return

        if apiResp.BadRequest:
            self.logger.LogMessage('verbose', 'bad meta, ignoring... {0}'.format(fileMeta.full_name))
            return

        if apiResp.InsufficientStorage:
            # The placeholder was missing, so the file name never reached the log.
            self.logger.LogMessage('verbose', 'insufficient storage {0}'.format(fileMeta.full_name))
            return

        if not apiResp.Ok:
            self.logger.LogMessage('error', 'unexpected response on adding meta {0} {1} {2}'.format(fileMeta.full_name, apiResp.code, apiResp.message))
            return

        self.logger.LogMessage('verbose', 'meta added {0}'.format(fileMeta.full_name))