OwlCyberSecurity - MANAGER
Edit File: anonymize.py
#!/usr/bin/env python ## # Copyright (c) 2006-2017 Apple Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ## from __future__ import print_function from __future__ import with_statement from getopt import getopt, GetoptError from subprocess import Popen, PIPE, STDOUT import datetime import hashlib import os import random import shutil import sys import urllib import uuid import xattr import zlib from plistlib import readPlistFromString from pycalendar.icalendar.calendar import Calendar from pycalendar.parameter import Parameter COPY_CAL_XATTRS = ( 'WebDAV:{DAV:}resourcetype', 'WebDAV:{urn:ietf:params:xml:ns:caldav}calendar-timezone', 'WebDAV:{http:%2F%2Fapple.com%2Fns%2Fical%2F}calendar-color', 'WebDAV:{http:%2F%2Fapple.com%2Fns%2Fical%2F}calendar-order', ) COPY_EVENT_XATTRS = ('WebDAV:{DAV:}getcontenttype',) def usage(e=None): if e: print(e) print("") name = os.path.basename(sys.argv[0]) print("usage: %s [options] source destination" % (name,)) print("") print(" Anonymizes calendar data") print("") print(" source and destination should refer to document root directories") print("") print("options:") print(" -h --help: print this help and exit") print(" -n --node <node>: Directory node (defaults to /Search)") print("") if e: sys.exit(64) else: sys.exit(0) def main(): try: (optargs, args) = getopt( sys.argv[1:], "hn:", [ "help", "node=", ], ) except GetoptError, e: usage(e) # # Get configuration # directoryNode = "/Search" for opt, arg in optargs: if opt in ("-h", "--help"): usage() elif opt in ("-n", "--node"): directoryNode = arg if len(args) != 2: usage("Source and destination directories must be specified.") sourceDirectory, destDirectory = args directoryMap = DirectoryMap(directoryNode) anonymizeRoot(directoryMap, sourceDirectory, destDirectory) directoryMap.printStats() directoryMap.dumpDsImports(os.path.join(destDirectory, "dsimports")) def anonymizeRoot(directoryMap, sourceDirectory, destDirectory): # sourceDirectory and destDirectory are DocumentRoots print("Anonymizing calendar data from %s into %s" % (sourceDirectory, destDirectory)) homes = 0 calendars = 0 resources = 0 if not os.path.exists(sourceDirectory): print("Can't find source: %s" % (sourceDirectory,)) sys.exit(1) if not os.path.exists(destDirectory): os.makedirs(destDirectory) sourceCalRoot = os.path.join(sourceDirectory, "calendars") destCalRoot = os.path.join(destDirectory, "calendars") if not os.path.exists(destCalRoot): os.makedirs(destCalRoot) sourceUidHomes = os.path.join(sourceCalRoot, "__uids__") if os.path.exists(sourceUidHomes): destUidHomes = os.path.join(destCalRoot, "__uids__") if not os.path.exists(destUidHomes): os.makedirs(destUidHomes) homeList = [] for first in os.listdir(sourceUidHomes): if len(first) == 2: firstPath = os.path.join(sourceUidHomes, first) for second in os.listdir(firstPath): if len(second) == 2: secondPath = os.path.join(firstPath, second) for home in os.listdir(secondPath): record = directoryMap.lookupCUA(home) if not record: print("Couldn't find %s, skipping." % (home,)) continue sourceHome = os.path.join(secondPath, home) destHome = os.path.join( destUidHomes, record['guid'][0:2], record['guid'][2:4], record['guid']) homeList.append((sourceHome, destHome, record)) else: home = first sourceHome = os.path.join(sourceUidHomes, home) if not os.path.isdir(sourceHome): continue record = directoryMap.lookupCUA(home) if not record: print("Couldn't find %s, skipping." % (home,)) continue sourceHome = os.path.join(sourceUidHomes, home) destHome = os.path.join(destUidHomes, record['guid']) homeList.append((sourceHome, destHome, record)) print("Processing %d calendar homes..." % (len(homeList),)) for sourceHome, destHome, record in homeList: quotaUsed = 0 if not os.path.exists(destHome): os.makedirs(destHome) homes += 1 # Iterate calendars freeBusies = [] for cal in os.listdir(sourceHome): # Skip these: if cal in ("dropbox", "notifications"): continue # Don't include these in freebusy list if cal not in ("inbox", "outbox"): freeBusies.append(cal) sourceCal = os.path.join(sourceHome, cal) destCal = os.path.join(destHome, cal) if not os.path.exists(destCal): os.makedirs(destCal) calendars += 1 # Copy calendar xattrs for attr, value in xattr.xattr(sourceCal).iteritems(): if attr in COPY_CAL_XATTRS: xattr.setxattr(destCal, attr, value) # Copy index sourceIndex = os.path.join(sourceCal, ".db.sqlite") destIndex = os.path.join(destCal, ".db.sqlite") if os.path.exists(sourceIndex): shutil.copyfile(sourceIndex, destIndex) # Iterate resources for resource in os.listdir(sourceCal): if resource.startswith("."): continue sourceResource = os.path.join(sourceCal, resource) # Skip directories if os.path.isdir(sourceResource): continue with open(sourceResource) as res: data = res.read() data = anonymizeData(directoryMap, data) if data is None: # Ignore data we can't parse continue destResource = os.path.join(destCal, resource) with open(destResource, "w") as res: res.write(data) quotaUsed += len(data) for attr, value in xattr.xattr(sourceResource).iteritems(): if attr in COPY_EVENT_XATTRS: xattr.setxattr(destResource, attr, value) # Set new etag xml = "<?xml version='1.0' encoding='UTF-8'?>\r\n<getcontentmd5 xmlns='http://twistedmatrix.com/xml_namespace/dav/'>%s</getcontentmd5>\r\n" % (hashlib.md5(data).hexdigest(),) xattr.setxattr(destResource, "WebDAV:{http:%2F%2Ftwistedmatrix.com%2Fxml_namespace%2Fdav%2F}getcontentmd5", zlib.compress(xml)) resources += 1 # Store new ctag on calendar xml = "<?xml version='1.0' encoding='UTF-8'?>\r\n<getctag xmlns='http://calendarserver.org/ns/'>%s</getctag>\r\n" % (str(datetime.datetime.now()),) xattr.setxattr(destCal, "WebDAV:{http:%2F%2Fcalendarserver.org%2Fns%2F}getctag", zlib.compress(xml)) # Calendar home quota xml = "<?xml version='1.0' encoding='UTF-8'?>\r\n<quota-used xmlns='http://twistedmatrix.com/xml_namespace/dav/private/'>%d</quota-used>\r\n" % (quotaUsed,) xattr.setxattr(destHome, "WebDAV:{http:%2F%2Ftwistedmatrix.com%2Fxml_namespace%2Fdav%2Fprivate%2F}quota-used", zlib.compress(xml)) # Inbox free busy calendars list destInbox = os.path.join(destHome, "inbox") if not os.path.exists(destInbox): os.makedirs(destInbox) xml = "<?xml version='1.0' encoding='UTF-8'?><calendar-free-busy-set xmlns='urn:ietf:params:xml:ns:caldav'>\n" for freeBusy in freeBusies: xml += "<href xmlns='DAV:'>/calendars/__uids__/%s/%s/</href>\n" % (record['guid'], freeBusy) xml += "</calendar-free-busy-set>\n" xattr.setxattr( destInbox, "WebDAV:{urn:ietf:params:xml:ns:caldav}calendar-free-busy-set", zlib.compress(xml) ) if not (homes % 100): print(" %d..." % (homes,)) print("Done.") print("") print("Calendar totals:") print(" Calendar homes: %d" % (homes,)) print(" Calendars: %d" % (calendars,)) print(" Events: %d" % (resources,)) print("") def anonymizeData(directoryMap, data): try: pyobj = Calendar.parseText(data) except Exception, e: print("Failed to parse (%s): %s" % (e, data)) return None # Delete property from the top level try: for prop in pyobj.getProperties('x-wr-calname'): prop.setValue(anonymize(prop.getValue().getValue())) except KeyError: pass for comp in pyobj.getComponents(): # Replace with anonymized CUAs: for propName in ('organizer', 'attendee'): try: for prop in tuple(comp.getProperties(propName)): cua = prop.getValue().getValue() record = directoryMap.lookupCUA(cua) if record is None: # print("Can't find record for", cua) record = directoryMap.addRecord(cua=cua) if record is None: comp.removeProperty(prop) continue prop.setValue("urn:uuid:%s" % (record['guid'],)) if prop.hasParameter('X-CALENDARSERVER-EMAIL'): prop.replaceParameter(Parameter('X-CALENDARSERVER-EMAIL', record['email'])) else: prop.removeParameters('EMAIL') prop.addParameter(Parameter('EMAIL', record['email'])) prop.removeParameters('CN') prop.addParameter(Parameter('CN', record['name'])) except KeyError: pass # Replace with anonymized text: for propName in ('summary', 'location', 'description'): try: for prop in comp.getProperties(propName): prop.setValue(anonymize(prop.getValue().getValue())) except KeyError: pass # Replace with anonymized URL: try: for prop in comp.getProperties('url'): prop.setValue("http://example.com/%s/" % (anonymize(prop.getValue().getValue()),)) except KeyError: pass # Remove properties: for propName in ('x-apple-dropbox', 'attach'): try: for prop in tuple(comp.getProperties(propName)): comp.removeProperty(prop) except KeyError: pass return pyobj.getText(includeTimezones=Calendar.ALL_TIMEZONES) class DirectoryMap(object): def __init__(self, node): self.map = {} self.byType = { 'users': [], 'groups': [], 'locations': [], 'resources': [], } self.counts = { 'users': 0, 'groups': 0, 'locations': 0, 'resources': 0, 'unknown': 0, } self.strings = { 'users': ('Users', 'user'), 'groups': ('Groups', 'group'), 'locations': ('Places', 'location'), 'resources': ('Resources', 'resource'), } print("Fetching records from directory: %s" % (node,)) for internalType, (recordType, _ignore_friendlyType) in self.strings.iteritems(): print(" %s..." % (internalType,)) child = Popen( args=[ "/usr/bin/dscl", "-plist", node, "-readall", "/%s" % (recordType,), "GeneratedUID", "RecordName", "EMailAddress", "GroupMembers" ], stdout=PIPE, stderr=STDOUT, ) output, error = child.communicate() if child.returncode: raise DirectoryError(error) else: records = readPlistFromString(output) random.shuffle(records) # so we don't go alphabetically for record in records: origGUID = record.get('dsAttrTypeStandard:GeneratedUID', [None])[0] if not origGUID: continue origRecordNames = record['dsAttrTypeStandard:RecordName'] origEmails = record.get('dsAttrTypeStandard:EMailAddress', []) origMembers = record.get('dsAttrTypeStandard:GroupMembers', []) self.addRecord( internalType=internalType, guid=origGUID, names=origRecordNames, emails=origEmails, members=origMembers) print("Done.") print("") def addRecord( self, internalType="users", guid=None, names=None, emails=None, members=None, cua=None ): if cua: keys = [self.cua2key(cua)] self.counts['unknown'] += 1 else: keys = self.getKeys(guid, names, emails) if keys: self.counts[internalType] += 1 count = self.counts[internalType] namePrefix = randomName(6) typeStr = self.strings[internalType][1] record = { 'guid': str(uuid.uuid4()).upper(), 'name': "%s %s%d" % (namePrefix, typeStr, count,), 'first': namePrefix, 'last': "%s%d" % (typeStr, count,), 'recordName': "%s%d" % (typeStr, count,), 'email': ("%s%d@example.com" % (typeStr, count,)), 'type': self.strings[internalType][0], 'cua': cua, 'members': members, } for key in keys: self.map[key] = record self.byType[internalType].append(record) return record else: return None def getKeys(self, guid, names, emails): keys = [] if guid: keys.append(guid.lower()) if names: for name in names: try: name = name.encode('utf-8') name = urllib.quote(name).lower() keys.append(name) except: # print("Failed to urllib.quote( ) %s. Skipping." % (name,)) pass if emails: for email in emails: email = email.lower() keys.append(email) return keys def cua2key(self, cua): key = cua.lower() if key.startswith("mailto:"): key = key[7:] elif key.startswith("urn:uuid:"): key = key[9:] elif (key.startswith("/") or key.startswith("http")): key = key.rstrip("/") key = key.split("/")[-1] return key def lookupCUA(self, cua): key = self.cua2key(cua) if key and key in self.map: return self.map[key] else: return None def printStats(self): print("Directory totals:") for internalType, (recordType, ignore) in self.strings.iteritems(): print(" %s: %d" % (recordType, self.counts[internalType])) unknown = self.counts['unknown'] if unknown: print(" Principals not found in directory: %d" % (unknown,)) def dumpDsImports(self, dirPath): if not os.path.exists(dirPath): os.makedirs(dirPath) uid = 1000000 filePath = os.path.join(dirPath, "users.dsimport") with open(filePath, "w") as out: out.write("0x0A 0x5C 0x3A 0x2C dsRecTypeStandard:Users 12 dsAttrTypeStandard:RecordName dsAttrTypeStandard:AuthMethod dsAttrTypeStandard:Password dsAttrTypeStandard:UniqueID dsAttrTypeStandard:GeneratedUID dsAttrTypeStandard:PrimaryGroupID dsAttrTypeStandard:RealName dsAttrTypeStandard:FirstName dsAttrTypeStandard:LastName dsAttrTypeStandard:NFSHomeDirectory dsAttrTypeStandard:UserShell dsAttrTypeStandard:EMailAddress\n") for record in self.byType['users']: fields = [] fields.append(record['recordName']) fields.append("dsAuthMethodStandard\\:dsAuthClearText") fields.append("test") # password fields.append(str(uid)) fields.append(record['guid']) fields.append("20") # primary group id fields.append(record['name']) fields.append(record['first']) fields.append(record['last']) fields.append("/var/empty") fields.append("/usr/bin/false") fields.append(record['email']) out.write(":".join(fields)) out.write("\n") uid += 1 gid = 2000000 filePath = os.path.join(dirPath, "groups.dsimport") with open(filePath, "w") as out: out.write("0x0A 0x5C 0x3A 0x2C dsRecTypeStandard:Groups 5 dsAttrTypeStandard:RecordName dsAttrTypeStandard:PrimaryGroupID dsAttrTypeStandard:GeneratedUID dsAttrTypeStandard:RealName dsAttrTypeStandard:GroupMembership\n") for record in self.byType['groups']: fields = [] fields.append(record['recordName']) fields.append(str(gid)) fields.append(record['guid']) fields.append(record['name']) anonMembers = [] for member in record['members']: memberRec = self.lookupCUA("urn:uuid:%s" % (member,)) if memberRec: anonMembers.append(memberRec['guid']) if anonMembers: # skip empty groups fields.append(",".join(anonMembers)) out.write(":".join(fields)) out.write("\n") gid += 1 filePath = os.path.join(dirPath, "resources.dsimport") with open(filePath, "w") as out: out.write("0x0A 0x5C 0x3A 0x2C dsRecTypeStandard:Resources 3 dsAttrTypeStandard:RecordName dsAttrTypeStandard:GeneratedUID dsAttrTypeStandard:RealName\n") for record in self.byType['resources']: fields = [] fields.append(record['recordName']) fields.append(record['guid']) fields.append(record['name']) out.write(":".join(fields)) out.write("\n") filePath = os.path.join(dirPath, "places.dsimport") with open(filePath, "w") as out: out.write("0x0A 0x5C 0x3A 0x2C dsRecTypeStandard:Places 3 dsAttrTypeStandard:RecordName dsAttrTypeStandard:GeneratedUID dsAttrTypeStandard:RealName\n") for record in self.byType['locations']: fields = [] fields.append(record['recordName']) fields.append(record['guid']) fields.append(record['name']) out.write(":".join(fields)) out.write("\n") class DirectoryError(Exception): """ Error trying to access dscl """ class DatabaseError(Exception): """ Error trying to access sqlite3 """ def anonymize(text): """ Return a string whose value is the hex digest of text, repeated as needed to create a string of the same length as text. Useful for anonymizing strings in a deterministic manner. """ if isinstance(text, unicode): try: text = text.encode('utf-8') except UnicodeEncodeError: print("Failed to anonymize:", text) text = "Anonymize me!" h = hashlib.md5(text) h = h.hexdigest() l = len(text) return (h * ((l / 32) + 1))[:-(32 - (l % 32))] nameChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" def randomName(length): l = [] for _ignore in xrange(length): l.append(random.choice(nameChars)) return "".join(l) if __name__ == "__main__": main()