Ver Mensaje Individual
  #3 (permalink)  
Antiguo 12/03/2010, 07:15
jlgarcia1977
 
Fecha de Ingreso: octubre-2008
Ubicación: Madrid
Mensajes: 352
Antigüedad: 15 años, 7 meses
Puntos: 5
Respuesta: Extraer texto DOCX

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;


public class PruebaDocx {

/**
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
// TODO Auto-generated method stub
FileInputStream fis = new FileInputStream("D:\\Depeche.docx");
POIFSFileSystem fileSystem = new POIFSFileSystem(fis);
// Firstly, get an extractor for the Workbook
POIOLE2TextExtractor oleTextExtractor =
ExtractorFactory.createExtractor(fileSystem);
// Then a List of extractors for any embedded Excel, Word, PowerPoint
// or Visio objects embedded into it.
POITextExtractor[] embeddedExtractors =
ExtractorFactory.getEmbededDocsTextExtractors(oleT extExtractor);
for (POITextExtractor textExtractor : embeddedExtractors) {
// If the embedded object was an Excel spreadsheet.
if (textExtractor instanceof ExcelExtractor) {
ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor;
System.out.println(excelExtractor.getText());
}
// A Word Document
else if (textExtractor instanceof WordExtractor) {
WordExtractor wordExtractor = (WordExtractor) textExtractor;
String[] paragraphText = wordExtractor.getParagraphText();
for (String paragraph : paragraphText) {
System.out.println(paragraph);
}
// Display the document's header and footer text
System.out.println("Footer text: " + wordExtractor.getFooterText());
System.out.println("Header text: " + wordExtractor.getHeaderText());
}
// PowerPoint Presentation.
else if (textExtractor instanceof PowerPointExtractor) {
PowerPointExtractor powerPointExtractor =
(PowerPointExtractor) textExtractor;
System.out.println("Text: " + powerPointExtractor.getText());
System.out.println("Notes: " + powerPointExtractor.getNotes());
}
// Visio Drawing
else if (textExtractor instanceof VisioTextExtractor) {
VisioTextExtractor visioTextExtractor =
(VisioTextExtractor) textExtractor;
System.out.println("Text: " + visioTextExtractor.getText());
}
}
}

}


Me da el siguiente error al ejecutar.

Exception in thread "main" org.apache.poi.poifs.filesystem.OfficeXmlFileExcep tion: The supplied data appears to be in the Office 2007+ XML. You are calling the part of POI that deals with OLE2 Office Documents. You need to call a different part of POI to process this data (eg XSSF instead of HSSF)
at org.apache.poi.poifs.storage.HeaderBlockReader.<in it>(HeaderBlockReader.java:98)
at org.apache.poi.poifs.filesystem.POIFSFileSystem.<i nit>(POIFSFileSystem.java:151)
at PruebaDocx.main(PruebaDocx.java:23)