|
9 | 9 | from matplotlib.ticker import MultipleLocator
|
10 | 10 | from tqdm import tqdm
|
11 | 11 | from scipy import sparse
|
12 |
| -from Bio import SeqIO |
13 |
| -from sklearn.preprocessing import MultiLabelBinarizer |
14 | 12 | from sklearn import metrics as metrics
|
15 |
| -import pandas as pd |
16 | 13 |
|
17 | 14 | def getBigwigFileList(pDirectory):
|
18 | 15 | #returns a list of bigwig files in pDirectory
|
@@ -405,42 +402,6 @@ def clampArray(pArray):
|
405 | 402 | clampedArray[clampedArray > upperClampingBound] = upperClampingBound
|
406 | 403 | return clampedArray
|
407 | 404 |
|
408 |
| -def encodeSequence(pSequenceStr, pClasses=None): |
409 |
| - #one-hot encoding for DNA sequences |
410 |
| - if pSequenceStr is None or pSequenceStr == "": |
411 |
| - msg = "Aborting. DNA sequence is empty" |
412 |
| - raise SystemExit(msg) |
413 |
| - mlb = MultiLabelBinarizer(classes=pClasses) |
414 |
| - encodedSequenceArray = mlb.fit_transform(pSequenceStr).astype("uint8") |
415 |
| - if encodedSequenceArray.shape[1] != 4: |
416 |
| - msg = "Warning: DNA sequence contains more than the 4 nucleotide symbols A,C,G,T\n" |
417 |
| - msg += "Check your input sequence, if this is not intended." |
418 |
| - print(msg) |
419 |
| - print("Contained symbols:", ", ".join(mlb.classes_)) |
420 |
| - return encodedSequenceArray |
421 |
| - |
422 |
| -def fillEncodedSequence(pEncodedSequenceArray, pBinSizeInt): |
423 |
| - #fill one-hot encoded sequence array with zero vectors such that |
424 |
| - #the length matches the number of bins |
425 |
| - if pBinSizeInt is None or not isinstance(pBinSizeInt, int): |
426 |
| - return |
427 |
| - actualLengthInt = pEncodedSequenceArray.shape[0] #here, length in basepairs |
428 |
| - targetLengthInt = int(np.ceil(actualLengthInt/pBinSizeInt))*pBinSizeInt #in basepairs |
429 |
| - returnArray = None |
430 |
| - if targetLengthInt > actualLengthInt: |
431 |
| - #append zero vectors to the array to fill the last bin |
432 |
| - #in case the chromosome length is not divisible by bin size (as is normal) |
433 |
| - toAppendArray = np.zeros((targetLengthInt-actualLengthInt,pEncodedSequenceArray.shape[1]),dtype="uint8") |
434 |
| - returnArray = np.append(pEncodedSequenceArray,toAppendArray,axis=0) |
435 |
| - else: |
436 |
| - msg = "Warning: could not append zeros to end of array.\n" |
437 |
| - msg += "Target length {:d}, actual length {:d}\n" |
438 |
| - msg += "Array left unchanged." |
439 |
| - msg = msg.format(targetLengthInt, actualLengthInt) |
440 |
| - print(msg) |
441 |
| - returnArray = pEncodedSequenceArray |
442 |
| - return returnArray |
443 |
| - |
444 | 405 | def computePearsonCorrelation(pCoolerFile1, pCoolerFile2,
|
445 | 406 | pWindowsize_bp,
|
446 | 407 | pModelChromList, pTargetChromStr,
|
|
0 commit comments