diff --git a/LDIFtoCSV.py b/LDIFtoCSV.py index 9a3744f..7cd6cb0 100644 --- a/LDIFtoCSV.py +++ b/LDIFtoCSV.py @@ -1,28 +1,28 @@ -""" -* Copyright (c) 2009, Jeffrey Tchang -* -* All rights reserved. -* -* -* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY -* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY -* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - +""" +* Copyright (c) 2009, Jeffrey Tchang +* +* All rights reserved. +* +* +* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + import os import sys import getopt import logging -from ldif import LDIFParser, LDIFWriter -import string +from ldif import LDIFParser, LDIFWriter +import string """ @@ -39,193 +39,192 @@ """ - -# The main issue with turning an LDIF into a CSV are multivalued attributes -# The first problem is figuring out if an attribute is multivalued. If it is, you have -# no way of knowing how many values it may have. This poses a problem as with CSVs you -# can only have a single column. - -# My solution to this is to parse through the entire LDIF file twice. The first pass -# figures out how many columns you will need to ensure a full extraction of the data. -# The second pass actually outputs the CSV. Obviously this is 2*O(n). - -# One of the issues with this is that a lot of spreadsheet programs will only support -# a maximum number of columns. Suppose a multivalued attribute had 200 or so values. -# This would eat up 200 columns. OpenOffice's maximum number of columns is 1024. - + +# The main issue with turning an LDIF into a CSV are multivalued attributes +# The first problem is figuring out if an attribute is multivalued. If it is, you have +# no way of knowing how many values it may have. This poses a problem as with CSVs you +# can only have a single column. + +# My solution to this is to parse through the entire LDIF file twice. The first pass +# figures out how many columns you will need to ensure a full extraction of the data. +# The second pass actually outputs the CSV. Obviously this is 2*O(n). + +# One of the issues with this is that a lot of spreadsheet programs will only support +# a maximum number of columns. Suppose a multivalued attribute had 200 or so values. +# This would eat up 200 columns. OpenOffice's maximum number of columns is 1024. + + # A handler that simply throws away any logging messages sent to it class NullHandler(logging.Handler): def emit(self,record): pass - -# This class handles reading the attributes and storing them into a list. 
It is used -# for the first pass of the LDIF file -class LDIFAttributeParser(LDIFParser): - - attributeDictionary = dict() + +# This class handles reading the attributes and storing them into a list. It is used +# for the first pass of the LDIF file +class LDIFAttributeParser(LDIFParser): + + attributeDictionary = dict() def __init__(self, input): LDIFParser.__init__(self, input) self.attributeDictionary = dict() - - # This function is called whenever an entry is parsed out - def handle(self, dn, entry): + + # This function is called whenever an entry is parsed out + def handle(self, dn, entry): # Always add the dn attribute with cardinality 1 self.attributeDictionary["dn"] = 1 - - # Loop through each of the attribute names - for attributeName in entry.keys(): - - # Add the name to the dictionary if it is not already there. Set the value to the cardinality of the - # of the attribute (the number of values that the attribute has) - if( attributeName not in self.attributeDictionary ): - self.attributeDictionary[attributeName] = len(entry[attributeName]) - - # If the attribute name is already in the dictionary, update the cardinality if it is bigger than the - # one I can currently have stored - else: - if( len(entry[attributeName]) > self.attributeDictionary[attributeName] ): - self.attributeDictionary[attributeName] = len(entry[attributeName]) - - -class LDIFCSVParser(LDIFParser): - - attributeDictionary = dict() - attributeList = [] + + # Loop through each of the attribute names + for attributeName in entry.keys(): + + # Add the name to the dictionary if it is not already there. 
Set the value to the cardinality of the + # attribute (the number of values that the attribute has) + if( attributeName not in self.attributeDictionary ): + self.attributeDictionary[attributeName] = len(entry[attributeName]) + + # If the attribute name is already in the dictionary, update the cardinality if it is bigger than the + # one currently stored + else: + if( len(entry[attributeName]) > self.attributeDictionary[attributeName] ): + self.attributeDictionary[attributeName] = len(entry[attributeName]) + + +class LDIFCSVParser(LDIFParser): + + attributeDictionary = dict() + attributeList = [] fieldSeparatorCharacter = "," textDelimiter = "\"" maximumColumns = 5 defaultOutput = sys.stdout - - def __init__(self, input, attributeDictionary, output): - LDIFParser.__init__(self, input) + + def __init__(self, input, attributeDictionary, output): + LDIFParser.__init__(self, input) self.attributeDictionary = attributeDictionary - self.defaultOutput = output - - # This function is called whenever an entry is parsed out - def handle(self, dn, entry): - - # Get a list of all the attributes in the entire LDIF and sort them - allAttributeNames = self.attributeDictionary.keys() - allAttributeNames.sort() - - # Loop through each of the attributes - for attributeName in allAttributeNames: - - # If the attribute is present in the entry print up to a maximum of - # maximumColumns or self.attributeDictionary[attributeName] - # Whichever is larger - numberOfTimesToPrint = self.attributeDictionary[attributeName] - - # This will result in a truncation of the data - if( numberOfTimesToPrint > self.maximumColumns ): - numberOfTimesToPrint = self.maximumColumns - - if( attributeName in entry ): - i = 0 - while( i < numberOfTimesToPrint ): - - if( i < len(entry[attributeName])): - - if( self.check_printable(entry[attributeName][i]) ): - self.defaultOutput.write(self.textDelimiter + entry[attributeName][i] + self.textDelimiter + self.fieldSeparatorCharacter) - else: - 
self.defaultOutput.write(self.textDelimiter + repr(entry[attributeName][i]) + self.textDelimiter + self.fieldSeparatorCharacter) - else: - self.defaultOutput.write(self.textDelimiter + self.textDelimiter + self.fieldSeparatorCharacter) - - i = i + 1 + self.defaultOutput = output + + # This function is called whenever an entry is parsed out + def handle(self, dn, entry): + + # Get a list of all the attributes in the entire LDIF and sort them + allAttributeNames = list(self.attributeDictionary.keys()) + allAttributeNames.sort() + + # Loop through each of the attributes + for attributeName in allAttributeNames: + + # If the attribute is present in the entry print up to a maximum of + # maximumColumns or self.attributeDictionary[attributeName] + # Whichever is smaller + numberOfTimesToPrint = self.attributeDictionary[attributeName] + + # This will result in a truncation of the data + if( numberOfTimesToPrint > self.maximumColumns ): + numberOfTimesToPrint = self.maximumColumns + + if( attributeName in entry ): + i = 0 + while( i < numberOfTimesToPrint ): + + if( i < len(entry[attributeName])): + + if( self.check_printable(entry[attributeName][i]) ): + self.defaultOutput.write(self.textDelimiter + entry[attributeName][i] + self.textDelimiter + self.fieldSeparatorCharacter) + else: + self.defaultOutput.write(self.textDelimiter + repr(entry[attributeName][i]) + self.textDelimiter + self.fieldSeparatorCharacter) + else: + self.defaultOutput.write(self.textDelimiter + self.textDelimiter + self.fieldSeparatorCharacter) + + i = i + 1 # If the attribute name is dn, print the fully qualified distinguished name elif(attributeName == "dn"): self.defaultOutput.write(self.textDelimiter + str(dn) + self.textDelimiter + self.fieldSeparatorCharacter) - - # If the attribute name is not in the entry print fieldSeparatorCharacter(s) - else: - i = 0 - while( i < numberOfTimesToPrint ): - self.defaultOutput.write(self.textDelimiter + self.textDelimiter + self.fieldSeparatorCharacter) - i = 
i + 1 - - # Print a newline - self.defaultOutput.write("\n") - - def check_printable(self, message): - for char in message: - if (ord(char) > 126 or ord(char) < 32): - return False - return True - + + # If the attribute name is not in the entry print fieldSeparatorCharacter(s) + else: + i = 0 + while( i < numberOfTimesToPrint ): + self.defaultOutput.write(self.textDelimiter + self.textDelimiter + self.fieldSeparatorCharacter) + i = i + 1 + + # Print a newline + self.defaultOutput.write("\n") + + def check_printable(self, message): + for char in message: + if (ord(char) > 126 or ord(char) < 32): + return False + return True + # Parses an LDIF file to find out all the attribute names as well as how many of each kind of attribute # are in the file. Returns a dictionary of attributes and the maximum number of times that value appears. def parseLDIFAttributes(filename): # Open the LDIF file for reading - LDIFFile = open(filename,"rb") - primaryLogger.debug("Opened <%s> for reading" % filename) - - # Create an instance of the attribute parser which will handle LDIF entries - attributeParser = LDIFAttributeParser(LDIFFile) - - # Perform the actual parsing using the AttributeParser - # This first pass is only to obtain the attributes - primaryLogger.debug("Parsing <%s> for attributes" % filename) - attributeParser.parse() - - # Close the file + LDIFFile = open(filename,"rb") + primaryLogger.debug("Opened <%s> for reading" % filename) + + # Create an instance of the attribute parser which will handle LDIF entries + attributeParser = LDIFAttributeParser(LDIFFile) + + # Perform the actual parsing using the AttributeParser + # This first pass is only to obtain the attributes + primaryLogger.debug("Parsing <%s> for attributes" % filename) + attributeParser.parse() + + # Close the file LDIFFile.close() - primaryLogger.debug("Closed file <%s>" % filename) + primaryLogger.debug("Closed file <%s>" % filename) # Return the dictionary of attributes. The key is the attribute name. 
The value is the # maximum number of times that value appears - return attributeParser.attributeDictionary + return attributeParser.attributeDictionary - - - def generateCSV(attributeDictionary, filename, output, fieldSeparatorCharacter = ",", textDelimiter = "\"", maximumColumns = 5 ): # Open the LDIF file for reading - LDIFFile = open(filename,"rb") - primaryLogger.debug("Opened <%s> for reading" % filename) - - # Create an instance of the attribute parser which will handle LDIF entries - CSVParser = LDIFCSVParser(LDIFFile,attributeDictionary,output) + LDIFFile = open(filename,"rb") + primaryLogger.debug("Opened <%s> for reading" % filename) + + # Create an instance of the attribute parser which will handle LDIF entries + CSVParser = LDIFCSVParser(LDIFFile,attributeDictionary,output) CSVParser.fieldSeparatorCharacter = fieldSeparatorCharacter CSVParser.textDelimiter = textDelimiter CSVParser.maximumColumns = maximumColumns - - # Print out the CSV header sorted - headerValues = attributeDictionary.keys() - headerValues.sort() - - # Count of the number of columns this CSV will have - numberOfColumns = 0 - - for columnName in headerValues: - numberOfTimesToPrint = attributeDictionary[columnName] - - # This will result in a truncation of the data - if( numberOfTimesToPrint > maximumColumns ): - numberOfTimesToPrint = maximumColumns - - i = 0 - while(i < numberOfTimesToPrint): - output.write(textDelimiter + columnName + textDelimiter + fieldSeparatorCharacter) - numberOfColumns = numberOfColumns + 1 - i = i + 1 - - # Write a newline after the header - output.write("\n") - - # Print out the main CSV data + + # Print out the CSV header sorted + headerValues = list(attributeDictionary.keys()) + headerValues.sort() + + # Count of the number of columns this CSV will have + numberOfColumns = 0 + + for columnName in headerValues: + numberOfTimesToPrint = attributeDictionary[columnName] + + # This will result in a truncation of the data + if( numberOfTimesToPrint > 
maximumColumns ): + numberOfTimesToPrint = maximumColumns + + i = 0 + while(i < numberOfTimesToPrint): + output.write(textDelimiter + columnName + textDelimiter + fieldSeparatorCharacter) + numberOfColumns = numberOfColumns + 1 + i = i + 1 + + # Write a newline after the header + output.write("\n") + + # Print out the main CSV data CSVParser.parse() - # Write a newline to end the file + # Write a newline to end the file output.write("\n") + LDIFFile.close() def setupLogging(logfilename=""): # Create the primaryLogger as a global variable @@ -247,24 +246,24 @@ def setupLogging(logfilename=""): # Text to describe out this command is used def usage(): - usage = """ - usage: LDIFtoCSV.py [options] - - -o : File to write output. By default this is set to sys.stdout - -l : File to write logging output. By default there is no logging. - -F : Character to separate the fields by. By default this is a - comma. i.e. -F"," - -D : Character to delimit the text value by. By default this is a - double quote. i.e. -D"\"" - -M : The maximum number of columns a multivalued attribute should - take up (default: 5). This is common with the objectClass - attribute where it can have over 20 values. Do you want to - have 20 columns each with the same heading objectClass or - do you want to limit it. - """ - sys.stdout.write(usage) - - """ + usage = """ + usage: LDIFtoCSV.py [options] + + -o : File to write output. By default this is set to sys.stdout + -l : File to write logging output. By default there is no logging. + -F : Character to separate the fields by. By default this is a + comma. i.e. -F"," + -D : Character to delimit the text value by. By default this is a + double quote. i.e. -D"\"" + -M : The maximum number of columns a multivalued attribute should + take up (default: 5). This is common with the objectClass + attribute where it can have over 20 values. Do you want to + have 20 columns each with the same heading objectClass or + do you want to limit it. 
+ """ + sys.stdout.write(usage) + + """ sys.stdout.write("usage: LDIFtoCSV.py [options] \n") sys.stdout.write("-o \t: File to write output. By default this is set to sys.stdout\n") sys.stdout.write("-l \t: File to write logging output. By default there is no logging.\n") @@ -275,7 +274,8 @@ def usage(): sys.stdout.write("\t\t Do you want to have 20 columns each with the same heading objectClass or do you want to limit it.\n") sys.stdout.write("\n") - """ + + """ # Primary function call def main(): @@ -292,13 +292,13 @@ def main(): # Use getopt to get all the options that might be present try: optionValueList, remainingItems = getopt.getopt(sys.argv[1:], "o:l:F:D:M:") - except getopt.GetoptError, exceptionObject: - print str(exceptionObject) + except getopt.GetoptError as exceptionObject: + print(str(exceptionObject)) usage() sys.exit(2) if( len(remainingItems) < 1 ): - print "Error: Expecting single filename argument at end of command line.\n" + print("Error: Expecting single filename argument at end of command line.\n") usage() sys.exit(2) @@ -333,8 +333,8 @@ def main(): primaryLogger.debug("maximumColumns: %d" % maximumColumns) # First pass obtains the attributes inside the LDIF - attributeDictionary = parseLDIFAttributes(remainingItems[0]) - primaryLogger.debug("Parsed attribute dictionary: " + repr(attributeDictionary)) + attributeDictionary = parseLDIFAttributes(remainingItems[0]) + primaryLogger.debug("Parsed attribute dictionary: " + repr(attributeDictionary)) # Default output is stdout output = sys.stdout @@ -347,11 +347,11 @@ def main(): # Close the file output.close() - -# Main entry point of program -if( __name__ == '__main__'): - main() - - +# Main entry point of program +if( __name__ == '__main__'): + main() + + + diff --git a/LDIFtoCSVUnitTest.py b/LDIFtoCSVUnitTest.py index c02eece..17af882 100644 --- a/LDIFtoCSVUnitTest.py +++ b/LDIFtoCSVUnitTest.py @@ -9,7 +9,7 @@ class LDIFAttributeChecks(unittest.TestCase): def testBasic(self): 
attributeDictionary = LDIFtoCSV.parseLDIFAttributes(os.path.join (self.sampleLDIFLocation, "Root.ldif")) - print "Parsed attribute dictionary: " + repr(attributeDictionary) + print("Parsed attribute dictionary: " + repr(attributeDictionary)) self.assertEqual(4, len(attributeDictionary)) @@ -37,7 +37,7 @@ def testThreeEntries(self): self.assertEqual(2, attributeDictionary["cn"]) self.assertEqual(1, attributeDictionary["sn"]) - print "Parsed attribute dictionary: " + repr(attributeDictionary) + print("Parsed attribute dictionary: " + repr(attributeDictionary)) if __name__ == "__main__": diff --git a/ldif.py b/ldif.py index 1a68d14..300fbb0 100644 --- a/ldif.py +++ b/ldif.py @@ -23,12 +23,11 @@ 'LDIFCopy', ] -import urlparse,urllib,base64,re,types - -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO +import urllib +import base64 +import re +import types +import io attrtype_pattern = r'[\w;.]+(;[\w_-]+)*' attrvalue_pattern = r'(([^,]|\\,)+|".*?")' @@ -134,7 +133,7 @@ def _unparseAttrTypeandValue(self,attr_type,attr_value): attr_value attribute value """ - if self._base64_attrs.has_key(attr_type.lower()) or \ + if attr_type.lower() in self._base64_attrs or \ needs_base64(attr_value): # Encode with base64 self._unfoldLDIFLine(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')])) @@ -164,7 +163,7 @@ def _unparseChangeRecord(self,modlist): elif mod_len==3: changetype = 'modify' else: - raise ValueError,"modlist item of wrong length" + raise ValueError("modlist item of wrong length") self._unparseAttrTypeandValue('changetype',changetype) for mod in modlist: if mod_len==2: @@ -173,7 +172,7 @@ def _unparseChangeRecord(self,modlist): mod_op,mod_type,mod_vals = mod self._unparseAttrTypeandValue(MOD_OP_STR[mod_op],mod_type) else: - raise ValueError,"Subsequent modlist item of wrong length" + raise ValueError("Subsequent modlist item of wrong length") if mod_vals: for mod_val in mod_vals: 
self._unparseAttrTypeandValue(mod_type,mod_val) @@ -199,7 +198,7 @@ def unparse(self,dn,record): elif isinstance(record,types.ListType): self._unparseChangeRecord(record) else: - raise ValueError, "Argument record must be dictionary or list" + raise ValueError("Argument record must be dictionary or list") # Write empty line separating the records self._output_file.write(self._line_sep) # Count records written @@ -223,7 +222,7 @@ def CreateLDIF(dn,record,base64_attrs=None,cols=76): Specifies how many columns a line may have before it's folded into many lines. """ - f = StringIO() + f = io.BytesIO() ldif_writer = LDIFWriter(f,base64_attrs,cols,'\n') ldif_writer.unparse(dn,record) s = f.getvalue() @@ -293,8 +292,12 @@ def _unfoldLDIFLine(self): """ Unfold several folded lines with trailing space into one line """ - unfolded_lines = [ self._stripLineSep(self._line) ] - self._line = self._input_file.readline() + # do we have strings or bytes? + try: + unfolded_lines = [ self._stripLineSep(str(self._line, 'utf-8')) ] + except TypeError: + unfolded_lines = [ self._stripLineSep(self._line) ] + self._line = self._input_file.readline().decode('utf-8') while self._line and self._line[0]==' ': unfolded_lines.append(self._stripLineSep(self._line[1:])) self._line = self._input_file.readline() @@ -329,8 +332,8 @@ def _parseAttrTypeandValue(self): url = unfolded_line[colon_pos+2:].strip() attr_value = None if self._process_url_schemes: - u = urlparse.urlparse(url) - if self._process_url_schemes.has_key(u[0]): + u = urllib.parse.urlparse(url) + if u[0] in self._process_url_schemes: attr_value = urllib.urlopen(url).read() elif value_spec==':\r\n' or value_spec=='\n': attr_value = '' @@ -356,25 +359,24 @@ def parse(self): if attr_type=='dn': # attr type and value pair was DN of LDIF record if dn!=None: - raise ValueError, 'Two lines starting with dn: in one record.' 
+ raise ValueError('Two lines starting with dn: in one record.') if not is_dn(attr_value): - raise ValueError, 'No valid string-representation of distinguished name %s.' % (repr(attr_value)) + raise ValueError('No valid string-representation of distinguished name %s.' % (repr(attr_value))) dn = attr_value elif attr_type=='version' and dn is None: version = 1 elif attr_type=='changetype': # attr type and value pair was DN of LDIF record if dn is None: - raise ValueError, 'Read changetype: before getting valid dn: line.' + raise ValueError('Read changetype: before getting valid dn: line.') if changetype!=None: - raise ValueError, 'Two lines starting with changetype: in one record.' - if not valid_changetype_dict.has_key(attr_value): - raise ValueError, 'changetype value %s is invalid.' % (repr(attr_value)) + raise ValueError('Two lines starting with changetype: in one record.') + if attr_value not in valid_changetype_dict: + raise ValueError('changetype value %s is invalid.' % (repr(attr_value))) changetype = attr_value - elif attr_value!=None and \ - not self._ignored_attr_types.has_key(attr_type.lower()): + elif attr_value!=None and attr_type.lower() not in self._ignored_attr_types: # Add the attribute to the entry if not ignored attribute - if entry.has_key(attr_type): + if attr_type in entry: entry[attr_type].append(attr_value) else: entry[attr_type]=[attr_value]