diff --git a/LDIFtoCSV.py b/LDIFtoCSV.py index 9a3744f..468cbe4 100644 --- a/LDIFtoCSV.py +++ b/LDIFtoCSV.py @@ -1,29 +1,30 @@ -""" -* Copyright (c) 2009, Jeffrey Tchang -* -* All rights reserved. -* -* -* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY -* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY -* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - +#!/usr/bin/env python3 +""" +* Copyright (c) 2009, Jeffrey Tchang +* +* All rights reserved. +* +* +* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + import os import sys import getopt import logging -from ldif import LDIFParser, LDIFWriter -import string - +from ldif import LDIFParser, LDIFWriter +import string +import argparse """ Known Issues @@ -39,319 +40,326 @@ """ - -# The main issue with turning an LDIF into a CSV are multivalued attributes -# The first problem is figuring out if an attribute is multivalued. If it is, you have -# no way of knowing how many values it may have. This poses a problem as with CSVs you -# can only have a single column. - -# My solution to this is to parse through the entire LDIF file twice. The first pass -# figures out how many columns you will need to ensure a full extraction of the data. -# The second pass actually outputs the CSV. Obviously this is 2*O(n). - -# One of the issues with this is that a lot of spreadsheet programs will only support -# a maximum number of columns. Suppose a multivalued attribute had 200 or so values. -# This would eat up 200 columns. OpenOffice's maximum number of columns is 1024. - + +# The main issue with turning an LDIF into a CSV are multivalued attributes +# The first problem is figuring out if an attribute is multivalued. If it is, you have +# no way of knowing how many values it may have. This poses a problem as with CSVs you +# can only have a single column. + +# My solution to this is to parse through the entire LDIF file twice. The first pass +# figures out how many columns you will need to ensure a full extraction of the data. +# The second pass actually outputs the CSV. Obviously this is 2*O(n). + +# One of the issues with this is that a lot of spreadsheet programs will only support +# a maximum number of columns. Suppose a multivalued attribute had 200 or so values. +# This would eat up 200 columns. OpenOffice's maximum number of columns is 1024. 
+ + # A handler that simply throws away any logging messages sent to it class NullHandler(logging.Handler): - def emit(self,record): - pass - - - -# This class handles reading the attributes and storing them into a list. It is used -# for the first pass of the LDIF file -class LDIFAttributeParser(LDIFParser): - - attributeDictionary = dict() - - def __init__(self, input): - LDIFParser.__init__(self, input) - self.attributeDictionary = dict() - - # This function is called whenever an entry is parsed out - def handle(self, dn, entry): - - # Always add the dn attribute with cardinality 1 - self.attributeDictionary["dn"] = 1 - - # Loop through each of the attribute names - for attributeName in entry.keys(): - - # Add the name to the dictionary if it is not already there. Set the value to the cardinality of the - # of the attribute (the number of values that the attribute has) - if( attributeName not in self.attributeDictionary ): - self.attributeDictionary[attributeName] = len(entry[attributeName]) - - # If the attribute name is already in the dictionary, update the cardinality if it is bigger than the - # one I can currently have stored - else: - if( len(entry[attributeName]) > self.attributeDictionary[attributeName] ): - self.attributeDictionary[attributeName] = len(entry[attributeName]) - - -class LDIFCSVParser(LDIFParser): - - attributeDictionary = dict() - attributeList = [] - fieldSeparatorCharacter = "," - textDelimiter = "\"" - maximumColumns = 5 - defaultOutput = sys.stdout - - def __init__(self, input, attributeDictionary, output): - LDIFParser.__init__(self, input) - self.attributeDictionary = attributeDictionary - self.defaultOutput = output - - # This function is called whenever an entry is parsed out - def handle(self, dn, entry): - - # Get a list of all the attributes in the entire LDIF and sort them - allAttributeNames = self.attributeDictionary.keys() - allAttributeNames.sort() - - # Loop through each of the attributes - for attributeName in 
allAttributeNames: - - # If the attribute is present in the entry print up to a maximum of - # maximumColumns or self.attributeDictionary[attributeName] - # Whichever is larger - numberOfTimesToPrint = self.attributeDictionary[attributeName] - - # This will result in a truncation of the data - if( numberOfTimesToPrint > self.maximumColumns ): - numberOfTimesToPrint = self.maximumColumns - - if( attributeName in entry ): - i = 0 - while( i < numberOfTimesToPrint ): - - if( i < len(entry[attributeName])): - - if( self.check_printable(entry[attributeName][i]) ): - self.defaultOutput.write(self.textDelimiter + entry[attributeName][i] + self.textDelimiter + self.fieldSeparatorCharacter) - else: - self.defaultOutput.write(self.textDelimiter + repr(entry[attributeName][i]) + self.textDelimiter + self.fieldSeparatorCharacter) - else: - self.defaultOutput.write(self.textDelimiter + self.textDelimiter + self.fieldSeparatorCharacter) - - i = i + 1 - - # If the attribute name is dn, print the fully qualified distinguished name - elif(attributeName == "dn"): - self.defaultOutput.write(self.textDelimiter + str(dn) + self.textDelimiter + self.fieldSeparatorCharacter) - - # If the attribute name is not in the entry print fieldSeparatorCharacter(s) - else: - i = 0 - while( i < numberOfTimesToPrint ): - self.defaultOutput.write(self.textDelimiter + self.textDelimiter + self.fieldSeparatorCharacter) - i = i + 1 - - # Print a newline - self.defaultOutput.write("\n") - - def check_printable(self, message): - for char in message: - if (ord(char) > 126 or ord(char) < 32): - return False - return True - + def emit(self, record): + pass + + +# This class handles reading the attributes and storing them into a list. 
It is used +# for the first pass of the LDIF file +class LDIFAttributeParser(LDIFParser): + + attributeDictionary = dict() + + def __init__(self, input): + LDIFParser.__init__(self, input) + self.attributeDictionary = dict() + + # This function is called whenever an entry is parsed out + def handle(self, dn, entry): + + # Always add the dn attribute with cardinality 1 + self.attributeDictionary["dn"] = 1 + + # Loop through each of the attribute names + for attributeName in entry.keys(): + + # Add the name to the dictionary if it is not already there. Set the value to the cardinality of the + # of the attribute (the number of values that the attribute has) + if attributeName not in self.attributeDictionary: + self.attributeDictionary[attributeName] = len(entry[attributeName]) + + # If the attribute name is already in the dictionary, update the cardinality if it is bigger than the + # one I can currently have stored + else: + if len(entry[attributeName]) > self.attributeDictionary[attributeName]: + self.attributeDictionary[attributeName] = len(entry[attributeName]) + + +class LDIFCSVParser(LDIFParser): + + attributeDictionary = dict() + attributeList = [] + fieldSeparatorCharacter = "," + textDelimiter = '"' + maximumColumns = 5 + defaultOutput = sys.stdout + + def __init__(self, input, attributeDictionary, output): + LDIFParser.__init__(self, input) + self.attributeDictionary = attributeDictionary + self.defaultOutput = output + + # This function is called whenever an entry is parsed out + def handle(self, dn, entry): + + # Get a list of all the attributes in the entire LDIF and sort them + allAttributeNames = list(self.attributeDictionary.keys()) + allAttributeNames.sort() + + # Loop through each of the attributes + for attributeName in allAttributeNames: + + # If the attribute is present in the entry print up to a maximum of + # maximumColumns or self.attributeDictionary[attributeName] + # Whichever is larger + numberOfTimesToPrint = 
self.attributeDictionary[attributeName] + + # This will result in a truncation of the data + if numberOfTimesToPrint > self.maximumColumns: + numberOfTimesToPrint = self.maximumColumns + + if attributeName in entry: + i = 0 + while i < numberOfTimesToPrint: + + if i < len(entry[attributeName]): + + if self.check_printable(entry[attributeName][i]): + self.defaultOutput.write( + self.textDelimiter + + entry[attributeName][i] + + self.textDelimiter + + self.fieldSeparatorCharacter + ) + else: + self.defaultOutput.write( + self.textDelimiter + + repr(entry[attributeName][i]) + + self.textDelimiter + + self.fieldSeparatorCharacter + ) + else: + self.defaultOutput.write( + self.textDelimiter + + self.textDelimiter + + self.fieldSeparatorCharacter + ) + + i = i + 1 + + # If the attribute name is dn, print the fully qualified distinguished name + elif attributeName == "dn": + self.defaultOutput.write( + self.textDelimiter + + str(dn) + + self.textDelimiter + + self.fieldSeparatorCharacter + ) + + # If the attribute name is not in the entry print fieldSeparatorCharacter(s) + else: + i = 0 + while i < numberOfTimesToPrint: + self.defaultOutput.write( + self.textDelimiter + + self.textDelimiter + + self.fieldSeparatorCharacter + ) + i = i + 1 + + # Print a newline + self.defaultOutput.write("\n") + + def check_printable(self, message): + for char in message: + if ord(char) > 126 or ord(char) < 32: + return False + return True + # Parses an LDIF file to find out all the attribute names as well as how many of each kind of attribute # are in the file. Returns a dictionary of attributes and the maximum number of times that value appears. 
-def parseLDIFAttributes(filename): - # Open the LDIF file for reading - LDIFFile = open(filename,"rb") - primaryLogger.debug("Opened <%s> for reading" % filename) - - # Create an instance of the attribute parser which will handle LDIF entries - attributeParser = LDIFAttributeParser(LDIFFile) - - # Perform the actual parsing using the AttributeParser - # This first pass is only to obtain the attributes - primaryLogger.debug("Parsing <%s> for attributes" % filename) - attributeParser.parse() - - # Close the file - LDIFFile.close() - primaryLogger.debug("Closed file <%s>" % filename) - - # Return the dictionary of attributes. The key is the attribute name. The value is the - # maximum number of times that value appears - return attributeParser.attributeDictionary - - - - - -def generateCSV(attributeDictionary, filename, output, fieldSeparatorCharacter = ",", textDelimiter = "\"", maximumColumns = 5 ): - # Open the LDIF file for reading - LDIFFile = open(filename,"rb") - primaryLogger.debug("Opened <%s> for reading" % filename) - - # Create an instance of the attribute parser which will handle LDIF entries - CSVParser = LDIFCSVParser(LDIFFile,attributeDictionary,output) - CSVParser.fieldSeparatorCharacter = fieldSeparatorCharacter - CSVParser.textDelimiter = textDelimiter - CSVParser.maximumColumns = maximumColumns - - # Print out the CSV header sorted - headerValues = attributeDictionary.keys() - headerValues.sort() - - # Count of the number of columns this CSV will have - numberOfColumns = 0 - - for columnName in headerValues: - numberOfTimesToPrint = attributeDictionary[columnName] - - # This will result in a truncation of the data - if( numberOfTimesToPrint > maximumColumns ): - numberOfTimesToPrint = maximumColumns - - i = 0 - while(i < numberOfTimesToPrint): - output.write(textDelimiter + columnName + textDelimiter + fieldSeparatorCharacter) - numberOfColumns = numberOfColumns + 1 - i = i + 1 - - # Write a newline after the header - output.write("\n") - - # 
Print out the main CSV data - CSVParser.parse() - - # Write a newline to end the file - output.write("\n") - -def setupLogging(logfilename=""): - # Create the primaryLogger as a global variable - global primaryLogger - primaryLogger = logging.Logger("primaryLogger",logging.DEBUG) - - # Create a handler to print to the log - if( logfilename != "" ): - fileHandler = logging.FileHandler(logfilename,"w",encoding=None, delay=0) - else: - fileHandler = NullHandler() - - # Set how the handler will print the pretty log events - primaryLoggerFormat = logging.Formatter("[%(asctime)s][%(funcName)s] - %(message)s",'%m/%d/%y %I:%M%p') - fileHandler.setFormatter(primaryLoggerFormat) - - # Append handler to the primaryLoggyouer - primaryLogger.addHandler(fileHandler) - -# Text to describe out this command is used -def usage(): - usage = """ - usage: LDIFtoCSV.py [options] - - -o : File to write output. By default this is set to sys.stdout - -l : File to write logging output. By default there is no logging. - -F : Character to separate the fields by. By default this is a - comma. i.e. -F"," - -D : Character to delimit the text value by. By default this is a - double quote. i.e. -D"\"" - -M : The maximum number of columns a multivalued attribute should - take up (default: 5). This is common with the objectClass - attribute where it can have over 20 values. Do you want to - have 20 columns each with the same heading objectClass or - do you want to limit it. - """ - sys.stdout.write(usage) - - """ - sys.stdout.write("usage: LDIFtoCSV.py [options] \n") - sys.stdout.write("-o \t: File to write output. By default this is set to sys.stdout\n") - sys.stdout.write("-l \t: File to write logging output. By default there is no logging.\n") - sys.stdout.write("-F \t: Character to separate the fields by. By default this is\n\t\t a comma. i.e. -F\",\"\n") - sys.stdout.write("-D \t: Character to delimit the text value by. By default this is a double quote. i.e. 
-D\"\\\"\"\n") - sys.stdout.write("-M \t: The maximum number of columns a multivalued attribute should take up (default: 5).\n") - sys.stdout.write("\t\t This is common with the objectClass attribute where it can have over 20 values.\n") - sys.stdout.write("\t\t Do you want to have 20 columns each with the same heading objectClass or do you want to limit it.\n") - - sys.stdout.write("\n") - """ - -# Primary function call +def parseLDIFAttributes(filename, verbose=False): + # Open the LDIF file for reading + LDIFFile = open(filename, "rb") + if verbose: + print("Opened <%s> for reading" % filename) + + # Create an instance of the attribute parser which will handle LDIF entries + attributeParser = LDIFAttributeParser(LDIFFile) + + # Perform the actual parsing using the AttributeParser + # This first pass is only to obtain the attributes + if verbose: + print("Parsing <%s> for attributes" % filename) + attributeParser.parse() + + # Close the file + LDIFFile.close() + if verbose: + print("Closed file <%s>" % filename) + + # Return the dictionary of attributes. The key is the attribute name. The value is the + # maximum number of times that value appears + return attributeParser.attributeDictionary + + +def generateCSV( + attributeDictionary, + filename, + output, + fieldSeparatorCharacter=",", + textDelimiter='"', + maximumColumns=5, + verbose=False, +): + # Open the LDIF file for reading + LDIFFile = open(filename, "rb") + + if verbose: + print("Opened <%s> for reading" % filename) + + # this ... 
isn't great, but occasionally useful + if not textDelimiter: + textDelimiter = "" + + # Create an instance of the attribute parser which will handle LDIF entries + CSVParser = LDIFCSVParser(LDIFFile, attributeDictionary, output) + CSVParser.fieldSeparatorCharacter = fieldSeparatorCharacter + CSVParser.textDelimiter = textDelimiter + CSVParser.maximumColumns = maximumColumns + + # Print out the CSV header sorted + headerValues = list(attributeDictionary.keys()) + headerValues.sort() + + # Count of the number of columns this CSV will have + numberOfColumns = 0 + + for columnName in headerValues: + numberOfTimesToPrint = attributeDictionary[columnName] + + # This will result in a truncation of the data + if numberOfTimesToPrint > maximumColumns: + numberOfTimesToPrint = maximumColumns + + i = 0 + while i < numberOfTimesToPrint: + output.write( + textDelimiter + columnName + textDelimiter + fieldSeparatorCharacter + ) + numberOfColumns = numberOfColumns + 1 + i = i + 1 + + # Write a newline after the header + output.write("\n") + + # Print out the main CSV data + CSVParser.parse() + + # Write a newline to end the file + output.write("\n") + LDIFFile.close() + + +# Primary function call def main(): - # Setup logging to /dev/null incase no log file is specified - setupLogging() - - # Variables to extract from command line (set the defaults here) - outputFilename = "" - fieldSeparatorCharacter = "," - textDelimiter = "\"" - maximumColumns = 5 - - # Use getopt to get all the options that might be present - try: - optionValueList, remainingItems = getopt.getopt(sys.argv[1:], "o:l:F:D:M:") - except getopt.GetoptError, exceptionObject: - print str(exceptionObject) - usage() - sys.exit(2) - - if( len(remainingItems) < 1 ): - print "Error: Expecting single filename argument at end of command line.\n" - usage() - sys.exit(2) - - - # Loop through each tuple returned - for option, value in optionValueList: - - # Setup logging - if option == "-l": - setupLogging(logfilename=value) 
- primaryLogger.debug("Logging initiated") - - # Get output filename - if option == "-o": - outputFilename = value - - # Get field separator character - if option == "-F": - fieldSeparatorCharacter = value - - # Get text delimiter character - if option == "-D": - textDelimiter = value - - # Get maximum number of columns - if option == "-M": - maximumColumns = int(value) - - primaryLogger.debug("outputFilename: %s" % outputFilename) - primaryLogger.debug("fieldSeparatorCharacter: %s" % fieldSeparatorCharacter) - primaryLogger.debug("textDelimiter: %s" % textDelimiter) - primaryLogger.debug("maximumColumns: %d" % maximumColumns) - - # First pass obtains the attributes inside the LDIF - attributeDictionary = parseLDIFAttributes(remainingItems[0]) - primaryLogger.debug("Parsed attribute dictionary: " + repr(attributeDictionary)) - - # Default output is stdout - output = sys.stdout - - if( outputFilename != "" ): - output = open(outputFilename,"w") - - # Second pass generates the actual CSV - generateCSV(attributeDictionary, remainingItems[0], output,fieldSeparatorCharacter,textDelimiter,maximumColumns) - - # Close the file - output.close() - -# Main entry point of program -if( __name__ == '__main__'): - main() - - - - + # Variables to extract from command line (set the defaults here) + outputFilename = "" + fieldSeparatorCharacter = "," + textDelimiter = '"' + maximumColumns = 5 + + parser = argparse.ArgumentParser() + parser.add_argument( + "-o", + "--output", + type=str, + nargs="?", + default="", + help="File to write output. By default this is set to sys.stdout", + ) + parser.add_argument( + "-F", + "--fieldsep", + type=str, + nargs="?", + default=",", + help='Character to separate the fields by. By default this is a comma. i.e. -F","', + ) + parser.add_argument( + "-D", + "--delimiter", + type=str, + nargs="?", + default='"', + help='Character to delimit the text value by. By default this is a double quote. i.e. 
-D"""', + ) + parser.add_argument( + "-M", + "--maxcols", + type=int, + nargs="?", + default=5, + help="The maximum number of columns a multivalued attribute should take up (default: 5). This is common with the objectClass attribute where it can have over 20 values. Do you want to have 20 columns each with the same heading objectClass or do you want to limit it", + ) + parser.add_argument( + "-i", "--input", type=str, nargs="?", required=True, help="LDIF file to read" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", default=False, help="increase verbosity" + ) + + args = parser.parse_args() + + if args.verbose: + print("Logging initiated") + + inputFilename = args.input + outputFilename = args.output + fieldSeparatorCharacter = args.fieldsep + textDelimiter = args.delimiter + maximumColumns = args.maxcols + + if args.verbose: + print("outputFilename: %s" % outputFilename) + print("fieldSeparatorCharacter: %s" % fieldSeparatorCharacter) + print("textDelimiter: %s" % textDelimiter) + print("maximumColumns: %d" % maximumColumns) + + # First pass obtains the attributes inside the LDIF + attributeDictionary = parseLDIFAttributes(inputFilename, args.verbose) + if args.verbose: + print("Parsed attribute dictionary: " + repr(attributeDictionary)) + + # Default output is stdout + output = sys.stdout + + if outputFilename != "": + output = open(outputFilename, "w") + + # Second pass generates the actual CSV + generateCSV( + attributeDictionary, + inputFilename, + output, + fieldSeparatorCharacter, + textDelimiter, + maximumColumns, + ) + + # Close the file + output.close() + + +# Main entry point of program +if __name__ == "__main__": + main() diff --git a/LDIFtoCSVUnitTest.py b/LDIFtoCSVUnitTest.py index c02eece..17af882 100644 --- a/LDIFtoCSVUnitTest.py +++ b/LDIFtoCSVUnitTest.py @@ -9,7 +9,7 @@ class LDIFAttributeChecks(unittest.TestCase): def testBasic(self): attributeDictionary = LDIFtoCSV.parseLDIFAttributes(os.path.join (self.sampleLDIFLocation, 
"Root.ldif")) - print "Parsed attribute dictionary: " + repr(attributeDictionary) + print("Parsed attribute dictionary: " + repr(attributeDictionary)) self.assertEqual(4, len(attributeDictionary)) @@ -37,7 +37,7 @@ def testThreeEntries(self): self.assertEqual(2, attributeDictionary["cn"]) self.assertEqual(1, attributeDictionary["sn"]) - print "Parsed attribute dictionary: " + repr(attributeDictionary) + print("Parsed attribute dictionary: " + repr(attributeDictionary)) if __name__ == "__main__": diff --git a/ldif.py b/ldif.py index 1a68d14..be558b6 100644 --- a/ldif.py +++ b/ldif.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ ldif - generate and parse LDIF data (see RFC 2849) @@ -13,7 +14,7 @@ __all__ = [ # constants - 'ldif_pattern', + #'ldif_pattern', # functions 'AttrTypeandValueLDIF','CreateLDIF','ParseLDIF', # classes @@ -23,20 +24,21 @@ 'LDIFCopy', ] -import urlparse,urllib,base64,re,types - -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO +import urllib.parse +import base64 +import re +import types +import io +import codecs attrtype_pattern = r'[\w;.]+(;[\w_-]+)*' attrvalue_pattern = r'(([^,]|\\,)+|".*?")' rdn_pattern = attrtype_pattern + r'[ ]*=[ ]*' + attrvalue_pattern dn_pattern = rdn_pattern + r'([ ]*,[ ]*' + rdn_pattern + r')*[ ]*' -dn_regex = re.compile('^%s$' % dn_pattern) +dn_regex = re.compile(r'^' + dn_pattern + r'$') +dn_regex_bytes = re.compile('^' + dn_pattern + '$') -ldif_pattern = '^((dn(:|::) %(dn_pattern)s)|(%(attrtype_pattern)s(:|::) .*)$)+' % vars() +# ldif_pattern = r'^((dn(:|::) %(dn_pattern)s)|(%(attrtype_pattern)s(:|::) .*)$)+' % vars() MOD_OP_INTEGER = { 'add':0,'delete':1,'replace':2 @@ -52,7 +54,7 @@ valid_changetype_dict[c]=None -SAFE_STRING_PATTERN = '(^(\000|\n|\r| |:|<)|[\000\n\r\200-\377]+|[ ]+$)' +SAFE_STRING_PATTERN = r'(^(\000|\n|\r| |:|<)|[\000\n\r\200-\377]+|[ ]+$)' safe_string_re = re.compile(SAFE_STRING_PATTERN) def is_dn(s): @@ -61,8 +63,12 @@ def is_dn(s): """ if s=='': return 1 - 
rm = dn_regex.match(s) - return rm!=None and rm.group(0)==s + new_s = codecs.escape_decode(s)[0].decode("utf-8") + # this turns out to create a literal b'UNICODE HERE' string + # but at least you get the right unicode + new_s = new_s.replace("b'", "").rstrip("'") + rm = dn_regex_bytes.match(new_s) + return rm!=None and rm.group(0)==new_s def needs_base64(s): @@ -134,7 +140,7 @@ def _unparseAttrTypeandValue(self,attr_type,attr_value): attr_value attribute value """ - if self._base64_attrs.has_key(attr_type.lower()) or \ + if attr_type.lower() in self._base64_attrs or \ needs_base64(attr_value): # Encode with base64 self._unfoldLDIFLine(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')])) @@ -164,7 +170,7 @@ def _unparseChangeRecord(self,modlist): elif mod_len==3: changetype = 'modify' else: - raise ValueError,"modlist item of wrong length" + raise ValueError("modlist item of wrong length") self._unparseAttrTypeandValue('changetype',changetype) for mod in modlist: if mod_len==2: @@ -173,7 +179,7 @@ def _unparseChangeRecord(self,modlist): mod_op,mod_type,mod_vals = mod self._unparseAttrTypeandValue(MOD_OP_STR[mod_op],mod_type) else: - raise ValueError,"Subsequent modlist item of wrong length" + raise ValueError("Subsequent modlist item of wrong length") if mod_vals: for mod_val in mod_vals: self._unparseAttrTypeandValue(mod_type,mod_val) @@ -199,7 +205,7 @@ def unparse(self,dn,record): elif isinstance(record,types.ListType): self._unparseChangeRecord(record) else: - raise ValueError, "Argument record must be dictionary or list" + raise ValueError("Argument record must be dictionary or list") # Write empty line separating the records self._output_file.write(self._line_sep) # Count records written @@ -223,7 +229,7 @@ def CreateLDIF(dn,record,base64_attrs=None,cols=76): Specifies how many columns a line may have before it's folded into many lines. 
""" - f = StringIO() + f = io.BytesIO() ldif_writer = LDIFWriter(f,base64_attrs,cols,'\n') ldif_writer.unparse(dn,record) s = f.getvalue() @@ -293,8 +299,12 @@ def _unfoldLDIFLine(self): """ Unfold several folded lines with trailing space into one line """ - unfolded_lines = [ self._stripLineSep(self._line) ] - self._line = self._input_file.readline() + # do we have strings or bytes? + try: + unfolded_lines = [ self._stripLineSep(str(self._line, 'utf-8')) ] + except TypeError: + unfolded_lines = [ self._stripLineSep(self._line) ] + self._line = self._input_file.readline().decode('utf-8') while self._line and self._line[0]==' ': unfolded_lines.append(self._stripLineSep(self._line[1:])) self._line = self._input_file.readline() @@ -322,15 +332,16 @@ def _parseAttrTypeandValue(self): value_spec = unfolded_line[colon_pos:colon_pos+2] if value_spec=='::': # attribute value needs base64-decoding - attr_value = base64.decodestring(unfolded_line[colon_pos+2:]) +# attr_value = base64.decodestring(unfolded_line[colon_pos+2:]) + attr_value = str(base64.b64decode(bytes(unfolded_line[colon_pos+2:], "utf-8"))) #attr_value = unfolded_line[colon_pos+2:] elif value_spec==':<': # fetch attribute value from URL url = unfolded_line[colon_pos+2:].strip() attr_value = None if self._process_url_schemes: - u = urlparse.urlparse(url) - if self._process_url_schemes.has_key(u[0]): + u = urllib.parse.urlparse(url) + if u[0] in self._process_url_schemes: attr_value = urllib.urlopen(url).read() elif value_spec==':\r\n' or value_spec=='\n': attr_value = '' @@ -356,25 +367,24 @@ def parse(self): if attr_type=='dn': # attr type and value pair was DN of LDIF record if dn!=None: - raise ValueError, 'Two lines starting with dn: in one record.' + raise ValueError('Two lines starting with dn: in one record.') if not is_dn(attr_value): - raise ValueError, 'No valid string-representation of distinguished name %s.' 
% (repr(attr_value)) + raise ValueError('No valid string-representation of distinguished name %s.' % (repr(attr_value))) dn = attr_value elif attr_type=='version' and dn is None: version = 1 elif attr_type=='changetype': # attr type and value pair was DN of LDIF record if dn is None: - raise ValueError, 'Read changetype: before getting valid dn: line.' + raise ValueError('Read changetype: before getting valid dn: line.') if changetype!=None: - raise ValueError, 'Two lines starting with changetype: in one record.' - if not valid_changetype_dict.has_key(attr_value): - raise ValueError, 'changetype value %s is invalid.' % (repr(attr_value)) + raise ValueError('Two lines starting with changetype: in one record.') + if attr_value not in valid_changetype_dict: + raise ValueError('changetype value %s is invalid.' % (repr(attr_value))) changetype = attr_value - elif attr_value!=None and \ - not self._ignored_attr_types.has_key(attr_type.lower()): + elif attr_value!=None and attr_type.lower() not in self._ignored_attr_types: # Add the attribute to the entry if not ignored attribute - if entry.has_key(attr_type): + if attr_type in entry: entry[attr_type].append(attr_value) else: entry[attr_type]=[attr_value] diff --git a/setup.py b/setup.py index ba38594..026f7c9 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,7 @@ from distutils.core import setup -import py2exe, sys, os +import sys, os +import platform + manifest = """ @@ -16,13 +18,14 @@ """ -setup( - console = [ - { - "script": "LDIFtoCSV.py", - "other_resources": [(24,1,manifest)] - } - ], - options = {'py2exe': {'bundle_files': 1}}, - zipfile = None -) +if platform.system() == "Windows": + import py2exe + + setup( + name="ldiftocsv", + console=[{"script": "LDIFtoCSV.py", "other_resources": [(24, 1, manifest)]}], + options={"py2exe": {"bundle_files": 1}}, + zipfile=None, + ) +else: + setup(name="ldiftocsv", scripts=["LDIFtoCSV.py"], py_modules=["ldif"])