Returns:
A list of strings that the text has been cut into.
chunkList = [] // The list of the chunks (a.k.a. a list of list of strings)
chunkSoFar = Queue() // The rolling window representing the (potential) chunk
currChunkSize = 0 // Index keeping track of whether or not it"s time to make a chunk out of the window
tillNextChunk = chunkSize - overlap // The distance between the starts of chunks
splitText = text.split("\n")
// Create list of chunks (chunks are lists of words and whitespace) by using a queue as a rolling window
for token in splitText:
if token == "":
chunkSoFar.put(token)
else:
currChunkSize += 1
if currChunkSize > chunkSize:
chunkList.append(list(chunkSoFar.queue))
stripLeadingLines(lineQueue=chunkSoFar, numLines=tillNextChunk)
currChunkSize -= tillNextChunk
chunkSoFar.put(token)
// Making sure the last chunk is of a sufficient proportion
lastChunk = list(chunkSoFar.queue) // Grab the final (partial) chunk
if (float(countWords(lastChunk)) / chunkSize) < lastProp: // If the proportion of the last chunk is too low
if len(chunkList)==0:
chunkList.extend(lastChunk)
else:
chunkList[-1].extend(lastChunk)
else:
chunkList.append(lastChunk)
// Make the list of lists of strings into a list of strings
countSubList = 0stringList=[]for subList in chunkList:
stringList.extend(["".join(subList)])
if type(subList) is ListType:
countSubList+=1
// Handle the case where chunkList contains no sublists (i.e. it is a flat list of strings)
if countSubList==0:
stringList = []
stringList.extend(["".join(chunkList)])
return stringList
def cutByNumber(text, numChunks):