From 85f3d92c18ec1cce217f4dbeb3ad6619505be1a3 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 27 Sep 2002 09:08:26 +0000 Subject: [PATCH] fix the text splitting routines so that we only break sections on word boundaries. Not only do we not have to learn how to hyphenate in a correctly localized manner, but we also avoid the problem encountered of splitting in the middle of an HTML entity such as "&quot;", which the xml parser just *loved* --- .../producer/PDFPreFormattingProducerNode.java | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/source/mircoders/producer/PDFPreFormattingProducerNode.java b/source/mircoders/producer/PDFPreFormattingProducerNode.java index 44f657a9..43dd3c50 100755 --- a/source/mircoders/producer/PDFPreFormattingProducerNode.java +++ b/source/mircoders/producer/PDFPreFormattingProducerNode.java @@ -97,25 +97,28 @@ public class PDFPreFormattingProducerNode implements ProducerNode { EntityList images=DatabaseContentToMedia.getInstance().getImages((EntityContent)entity); + String theContent = ((EntityContent) entity).getValue("content_data"); if (images == null){ HashMap row = new HashMap(); - row.put("text",((EntityContent) entity).getValue("content_data")); + row.put("text",theContent); row.put("hasImage","0"); brokenUpContent.add(row); } if (images != null){ //need to add checks for out of content! 
HashMap row0 = new HashMap(); - if (numCharsInAnImagelessRow>(((EntityContent) entity).getValue("content_data")).length()){ - row0.put("text",((EntityContent) entity).getValue("content_data")); + if (numCharsInAnImagelessRow>(theContent).length()){ + row0.put("text",theContent); outOfText = true; } else { - row0.put("text",((EntityContent) entity).getValue("content_data").substring(0,numCharsInAnImagelessRow)); + //break on words so we don't split html entities + int lastSpaceAt = theContent.lastIndexOf(" ",numCharsInAnImagelessRow); + row0.put("text",theContent.substring(0,lastSpaceAt)); + currentPosition=lastSpaceAt; } row0.put("hasImage","0"); brokenUpContent.add(row0); - currentPosition=numCharsInAnImagelessRow; aLogger.println("CP1 is "+ currentPosition); while(images.hasNext()){ HashMap row1 = new HashMap(); @@ -150,34 +153,37 @@ public class PDFPreFormattingProducerNode implements ProducerNode { row1.put("hasImage","1"); if (! outOfText){ try { - row1.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition,currentPosition+text_amount)); + int lastSpaceAt = theContent.lastIndexOf(" ",currentPosition+text_amount); + row1.put("text",theContent.substring(currentPosition,lastSpaceAt)); + currentPosition=lastSpaceAt; } catch (IndexOutOfBoundsException e){ - row1.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition)); + row1.put("text",theContent.substring(currentPosition)); outOfText = true; } } - currentPosition=currentPosition+text_amount; aLogger.println("CP2 is "+ currentPosition); brokenUpContent.add(row1); if (! 
outOfText){ try { - row2.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition,currentPosition+numCharsInAnImagelessRow)); + int lastSpaceAt = theContent.lastIndexOf(" ",currentPosition+numCharsInAnImagelessRow); + row2.put("text",theContent.substring(currentPosition,lastSpaceAt)); + currentPosition=lastSpaceAt; } catch (IndexOutOfBoundsException e){ - row2.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition)); + row2.put("text",theContent.substring(currentPosition)); outOfText = true; } } row2.put("hasImage","0"); brokenUpContent.add(row2); - currentPosition=currentPosition+numCharsInAnImagelessRow; + aLogger.println("CP3 is "+ currentPosition); } HashMap row3 = new HashMap(); if (! outOfText){ - row3.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition)); + row3.put("text",theContent.substring(currentPosition)); row3.put("hasImage","0"); brokenUpContent.add(row3); } -- 2.11.0