【问题标题】:How to convert .docx to .txt or get text from docx file in android?(如何在 Android 中将 .docx 转换为 .txt,或从 .docx 文件中获取文本?)
【发布时间】:2019-07-18 04:25:17
【问题描述】:

我正在使用 OneDrive SDK 从 OneDrive 下载 .docx 文件。下载成功了,但我需要把它转换成 .txt 格式,却不知道该怎么做。

有人知道如何在 android 中转换或获取 .docx 文件中的文本吗?

我可以得到 .docx 文件的 InputStream。

这是从 OneDrive 下载文件的代码

InputStream inputStream = iOneDriveClient.getDrive().getItems(fileID).getContent().buildRequest().get();
OutputStream out = new FileOutputStream(mPath);
int read;
byte[] bytes = new byte[1024];
while ((read = inputStream.read(bytes)) != -1) {
    out.write(bytes, 0, read);
}
out.flush();
out.close();
inputStream.close();

此代码已经在 doInBackground 中执行。

编辑

我已经添加了 Apache POI 库,但我无法编译它

我在很多文件上遇到冲突

这是我的build.gradle

apply plugin: 'com.android.application'
apply plugin: 'io.fabric'

android {
    compileSdkVersion myCompileSdkVersion
    compileOptions.encoding = 'windows-1251'

    defaultConfig {
        applicationId "com.my.app"
        versionCode 8
        versionName "1.0.5"

        minSdkVersion myMinSdkVersion
        targetSdkVersion myTargetSdkVersion
        vectorDrawables.useSupportLibrary = true

        multiDexEnabled true
        testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"

        ndk {
            moduleName "app"
            abiFilters "armeabi-v7a"/*, "x86"*/

        }
    }

    // Specifies one flavor dimension.
    flavorDimensions "common"
    productFlavors {
        live {
            dimension "common"
            buildConfigField 'boolean', 'IS_CUSOTM_APP', 'false'
        }

        custom {
            dimension "common"
            buildConfigField 'boolean', 'IS_CUSOTM_APP', 'true'
        }
    }

    sourceSets.main {
        res.srcDirs = ['src/main/res']
        jni.srcDirs = []
        jniLibs.srcDirs = ['src/main/jni']

    }

    task ndkBuild(type: Exec, description: 'Compile JNI source via NDK') {
        def ndkDir = android.ndkDirectory
        commandLine "$ndkDir/ndk-build",
                'NDK_PROJECT_PATH=build/intermediates/ndk',
                'NDK_LIBS_OUT=src/main/jniLibs',
                'APP_BUILD_SCRIPT=src/main/jni/Android.mk',
                'NDK_APPLICATION_MK=src/main/jni/Application.mk'
    }

    tasks.withType(JavaCompile) {
        compileTask -> compileTask.dependsOn ndkBuild
    }

    // Per-build-type configuration. 'release' and 'debug' are declared as siblings;
    // the original nested 'debug' inside 'release' and repeated the lint settings twice.
    buildTypes {
        release {
            minifyEnabled false
            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'

            // NOTE(review): lintOptions is normally declared directly under android {};
            // nested here it presumably still configures the module-wide lint options —
            // confirm and consider moving it up one level.
            lintOptions {
                // Do not run lint as part of release builds.
                checkReleaseBuilds false
                // Or, if you prefer, you can continue to check for errors in release builds,
                // but continue the build even when errors are found:
                abortOnError false
            }

            // NOTE(review): no signingConfigs block is visible in this script, so
            // 'config' must be declared elsewhere — verify.
            signingConfig signingConfigs.config
        }

        debug {
            debuggable = true
            jniDebuggable true
        }
    }

    dexOptions {
        preDexLibraries false
        javaMaxHeapSize "4g"
    }

    packagingOptions {
        exclude 'META-INF/DEPENDENCIES'
        exclude 'META-INF/NOTICE'
        exclude 'META-INF/LICENSE'
        exclude 'META-INF/LICENSE.txt'
        exclude 'META-INF/NOTICE.txt'
    }

    configurations.all {
        resolutionStrategy.force 'com.google.code.findbugs:jsr305:1.3.9'
    }


    externalNativeBuild {

        // Encapsulates your ndk-build build configurations.
        ndkBuild {
            // Provides a relative path to the Android.mk build script.
            path "src/main/jni/Android.mk"
        }
    }

    dataBinding {
        enabled = true
    }


    compileOptions {
        sourceCompatibility JavaVersion.VERSION_1_8
        targetCompatibility JavaVersion.VERSION_1_8
    }
}

dependencies {
    implementation 'androidx.constraintlayout:constraintlayout:1.1.3'

    implementation 'com.google.android.material:material:1.0.0-rc01'

    implementation "commons-io:commons-io:2.4"
    implementation "org.apache.commons:commons-lang3:3.5"

    implementation 'com.jakewharton:butterknife:10.1.0'
    annotationProcessor 'com.jakewharton:butterknife-compiler:10.1.0'
    implementation "com.google.code.gson:gson:2.8.0"

    implementation fileTree(include: ['*.jar'], dir: 'libs')

    implementation('com.google.api-client:google-api-client-android:1.28.0') {
        exclude group: 'org.apache.httpcomponents'
    }

    implementation('com.google.apis:google-api-services-drive:v3-rev136-1.25.0') {
        exclude group: 'org.apache.httpcomponents'
    }

    implementation 'androidx.multidex:multidex:2.0.1'
    //main libs
    implementation 'androidx.cardview:cardview:1.0.0'

    //google
    implementation 'com.google.android.gms:play-services-drive:17.0.0'
    implementation 'com.google.android.gms:play-services-auth:17.0.0'
    implementation 'com.google.firebase:firebase-core:17.0.0'
    implementation 'com.google.code.gson:gson:2.8.5'
    implementation 'com.crashlytics.sdk.android:crashlytics:2.10.1'
    implementation 'com.google.android.exoplayer:exoplayer-core:2.10.0'

    //for M+ permission handling
    implementation 'pub.devrel:easypermissions:1.2.0'

    //for downloading zip
    implementation 'com.mani:ThinDownloadManager:1.4.0'

    //for bottomseet dialog
    implementation 'com.orhanobut:dialogplus:1.11@aar'

    //for crash report
    implementation 'me.drakeet.library:crashwoodpecker:2.1.1'
    testImplementation 'junit:junit:4.13-beta-2'

    // bouncy castle
    implementation 'org.bouncycastle:bcprov-jdk15on:1.61'

    //for speech to text
    implementation 'net.gotev:speech:1.3.1'

    // Add Dagger dependencies
    implementation 'com.google.dagger:dagger:2.16'
    annotationProcessor 'com.google.dagger:dagger-compiler:2.16'

    // Add Dagger Android dependencies
    implementation 'com.google.dagger:dagger-android:2.16'
    implementation 'com.google.dagger:dagger-android-support:2.16'
    // if you use the support libraries
    annotationProcessor 'com.google.dagger:dagger-android-processor:2.16'

    // Add RXAndroid
    implementation 'io.reactivex.rxjava2:rxandroid:2.1.1'
    implementation 'io.reactivex.rxjava2:rxjava:2.2.6'
    implementation 'com.squareup.okhttp3:logging-interceptor:3.9.0'

    // LiveData Support
    implementation 'androidx.lifecycle:lifecycle-extensions:2.0.0'
    annotationProcessor 'androidx.lifecycle:lifecycle-compiler:2.0.0'

    //Retrofit for API Call
    implementation 'com.squareup.retrofit2:retrofit:2.3.0'
    implementation 'com.squareup.retrofit2:converter-gson:2.3.0'
    implementation 'com.squareup.retrofit2:adapter-rxjava2:2.3.0'

    implementation 'id.zelory:compressor:2.1.0'
    implementation 'androidx.exifinterface:exifinterface:1.1.0-beta01'

    //one drive
    implementation('com.onedrive.sdk:onedrive-sdk-android:1.3+') {
        transitive = false
    }

    // Include supported authentication methods for your application
    implementation 'com.microsoft.services.msa:msa-auth:0.8.+'
    implementation 'com.microsoft.aad:adal:1.1.+'

    implementation 'org.apache.tika:tika-parsers:1.21'
}
apply plugin: 'com.google.gms.google-services'
apply from: "../artifacts.gradle"

冲突错误是

在模块 docx4j-6.1.1-SNAPSHOT-shaded.jar (docx4j-6.1.1-SNAPSHOT-shaded.jar) 和 jackson-core-2.9.6.jar (com.fasterxml.jackson.core:jackson-core:2.9.6) 中发现重复的类 com.fasterxml.jackson.core.Base64Variant。

【问题讨论】:

标签: android apache apache-tika


【解决方案1】:

您可以使用Apache POI

来自文档:

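对于 Word 97 - Word 2003 的 .doc 文件,scratchpad 模块(poi-scratchpad)中有 org.apache.poi.hwpf.extractor.WordExtractor,它会返回文档中的文本。(对于 .docx 文件,对应的类是 poi-ooxml 模块中的 org.apache.poi.xwpf.extractor.XWPFWordExtractor。)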

这是来自 Docs 的示例:

// Open the OLE2 container stored in inputFile.
FileInputStream source = new FileInputStream(inputFile);
POIFSFileSystem container = new POIFSFileSystem(source);
// Extractor for the host document itself.
POIOLE2TextExtractor hostExtractor = ExtractorFactory.createExtractor(container);
// Walk the extractors of every Excel, Word, PowerPoint or Visio object
// embedded in the host document and print what each one contains.
for (POITextExtractor embedded : ExtractorFactory.getEmbededDocsTextExtractors(hostExtractor)) {
   // Embedded Excel spreadsheet: print its text.
   if (embedded instanceof ExcelExtractor) {
      System.out.println(((ExcelExtractor) embedded).getText());
      continue;
   }
   // Embedded Word document: print each paragraph, then footer and header.
   if (embedded instanceof WordExtractor) {
      WordExtractor word = (WordExtractor) embedded;
      for (String line : word.getParagraphText()) {
         System.out.println(line);
      }
      System.out.println("Footer text: " + word.getFooterText());
      System.out.println("Header text: " + word.getHeaderText());
      continue;
   }
   // Embedded PowerPoint presentation: print slide text and speaker notes.
   if (embedded instanceof PowerPointExtractor) {
      PowerPointExtractor slides = (PowerPointExtractor) embedded;
      System.out.println("Text: " + slides.getText());
      System.out.println("Notes: " + slides.getNotes());
   }
}

【讨论】:

  • 对于 Apache POI,我必须使用 tika 库,对吗?implementation group: 'org.apache.tika', name: 'tika-parsers', version: '1.21'。我还在使用其他 Apache 库,例如 org.apache.commons:commons-lang3:3.5 和 commons-io:commons-io:2.4。项目可以成功构建,但出现了一些重复类冲突:java.lang.RuntimeException: java.lang.RuntimeException: Duplicate class javax.activation.ActivationDataFlavor found in modules jakarta.activation-1.2.1.jar (com.sun.activation:jakarta.activation:1.2.1) and jakarta.activation-api-1.2.1.jar (jakarta.activation:jakarta.activation-api:1.2.1)
  • @Priyankagb 请看这个 SO 帖子(stackoverflow.com/a/36991200/7360848)下的所有答案。我猜,在手动把库添加到项目之后,你还必须手动删除重复的类。
  • 我解决了所有冲突,但现在编译时遇到如下错误:Caused by: com.android.builder.dexing.DexArchiveBuilderException: Failed to process /home/pc/.gradle/caches/modules-2/files-2.1/org.apache.poi/poi/4.1.0/3/poi-4.1.0.jar & Caused by: com.android.builder.dexing.DexArchiveBuilderException: Error while dexing & Caused by: com.android.tools.r8.CompilationFailedException: Compilation failed to complete & Caused by: com.android.tools.r8.utils.AbortException: Error: MethodHandle.invoke and MethodHandle.invokeExact are only supported starting with Android O (--min-api 26)
  • 这些答案对我都不管用。我已经启用了 multidex 和 Java 8,而且我并没有使用 guava:guava 库。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2010-12-02
  • 2013-04-29
  • 2017-04-14
  • 1970-01-01
  • 2023-03-23
相关资源
最近更新 更多