在文件数据挖掘处理技术中,如何从文本数据中抽取有价值的信息和知识是数据挖掘的一个重要分支,是机器学习、自然语言处理、数理统计的基础技术之一,也是信息检索,机器学习,人工智能等高端技术的底层技术支持之一。
Graccvs组件可以智能分析并提取各种文件中的文本,是为自然语言信息检索,机器学习等高端技术提供底层支持的技术组件,可为Lucene/CLucene, Elasticsearch, Sphinx等全文检索工具,以及OA, ERP, CRM,网盘,文件管理等其他系统提供文件摘要及搜索前置服务。同时也可以为安全网关,邮件内容监控,内网安全等系统提供文件搜索及监控服务的底层技术支持。
Graccvs组件是以xcframework格式提供的,点击这里下载。 同时网站提供XCode工程示例说明, 点击这里查看说明。
以下是简单的调用过程:
1:创建工程。
2:工程导入Graccvs.xcframework(把文件夹拖拽到工程中即可)。
3: 在主单元(示例中为:ViewController.h)中导入头文件并声明GraccvsGraccvsLib类型的属性,如下:
#import <Graccvs/Graccvs.h>
@property (strong, nonatomic) GraccvsGraccvsLib *hx;
4:调用初始化函数Load设置动态链接库需要的临时文件夹。
5:调用Auth注册,免费版本设置为空。
6:调用文件函数ToTextFile、HttpToString等提取N个不同文件的正文,或者使用异步函数批量处理文件。
7:完成文件提取任务后调用Unload函数,释放组件使用的资源。
主要代码单元函数定义 ViewController.h:
// ViewController.h
// libTest
//
// Created by graccvs on 2021/10/31.
#import <UIKit/UIKit.h>
#import <Hello/Hello.h>
#import <Graccvs/Graccvs.h>
/// Demo screen that drives the Graccvs text-extraction library from four
/// buttons and shows the result in a text view.
@interface ViewController : UIViewController

/// Graccvs library instance used by every extraction action.
@property (strong, nonatomic) GraccvsGraccvsLib *hx;

// Outlets wired up in Interface Builder. `strong` has the same ownership
// semantics as the legacy `retain` spelling and matches the `hx` property above.
@property (nonatomic, strong) IBOutlet UIButton *btToString;
@property (nonatomic, strong) IBOutlet UIButton *btToText;
@property (nonatomic, strong) IBOutlet UIButton *btHttpToString;
@property (nonatomic, strong) IBOutlet UIButton *btHttpToText;
@property (nonatomic, strong) IBOutlet UITextView *textView1;

/// Presents a modal alert showing `msg` with a single dismiss button.
- (void)alertX:(NSString *)msg;

/// Extracts the text of a bundled sample file and shows it as a string.
- (IBAction)toString:(id)obj;

/// Extracts the text of a bundled sample file into a text file and shows that file.
- (IBAction)toText:(id)obj;

/// Downloads a file over HTTP, extracts its text and shows it as a string.
- (IBAction)httpToString:(id)obj;

/// Downloads a file over HTTP, extracts its text into a text file and shows that file.
- (IBAction)httpToTextFile:(id)obj;

@end
ViewController.m
//
// ViewController.m
// libTest
//
// Created by graccvs on 2021/10/31.
//
#import "ViewController.h"
// Class extension reserved for private declarations; it currently declares nothing.
@interface ViewController ()
@end
/// Maps a numeric error code to its human-readable message.
///
/// Codes 0 through 9 each have a dedicated message (0 is "ok"); every other
/// value, including negative ones, yields the generic fallback message.
///
/// @param code The numeric error code to describe.
/// @return The message associated with `code`.
NSString *errorDesc(int32_t code)
{
    // Index i holds the message for error code i.
    static NSString * const messages[] = {
        @"ok",
        @"未知错误",
        @"提取源文件不存在",
        @"保存目标文件失败",
        @"提取的源文件超出设置的大小范围",
        @"不支持的提取文件格式",
        @"得到接口失败",
        @"HTTP下载文件失败",
        @"HTTP文件为空",
        @"软件许可错误",
    };
    static const int32_t messageCount = (int32_t)(sizeof(messages) / sizeof(messages[0]));

    BOOL isKnownCode = (code >= 0 && code < messageCount);
    return isKnownCode ? messages[code] : @"未知错误2";
}
@implementation ViewController

#pragma mark - Lifecycle

/// Creates the Graccvs library instance, points it at a writable scratch
/// folder and registers the license.
- (void)viewDidLoad {
    [super viewDidLoad];

    // The library needs a folder it can read and write; the app's Documents
    // directory is used for that here.
    NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES);
    NSString *temp = paths.firstObject;

    self.hx = [[GraccvsGraccvsLib alloc] init];
    [self.hx load:temp];

    // An empty license text is what the free edition uses.
    NSString *sn = @"";
    [self.hx auth:@"Beij Gaya" licText:sn];
}

/// Releases the library's resources when the controller goes away.
///
/// `-viewDidUnload` has been deprecated since iOS 6 and is not called by UIKit
/// any more, so `unload` would never run from there.
- (void)dealloc {
    [_hx unload];
}

#pragma mark - Helpers

/// Presents a modal alert showing `msg` with a single dismiss button.
- (void)alertX:(NSString *)msg {
    UIAlertController *alert = [UIAlertController alertControllerWithTitle:@"提示"
                                                                   message:msg
                                                            preferredStyle:UIAlertControllerStyleAlert];
    [alert addAction:[UIAlertAction actionWithTitle:@"确定" style:UIAlertActionStyleDefault handler:nil]];
    [self presentViewController:alert animated:YES completion:nil];
}

/// Serial queue on which every library call made by the button actions runs.
///
/// Running the calls one at a time keeps each `lastErr` lookup paired with the
/// call that produced it, and keeps slow extraction and HTTP downloads off the
/// main thread.
/// NOTE(review): assumes the Graccvs library may be called from a non-main
/// thread — confirm with the library documentation.
- (dispatch_queue_t)extractionQueue {
    static dispatch_queue_t queue;
    static dispatch_once_t onceToken;
    dispatch_once(&onceToken, ^{
        queue = dispatch_queue_create("cn.gaya-soft.graccvs.extraction", DISPATCH_QUEUE_SERIAL);
    });
    return queue;
}

/// Runs `work` on the extraction queue and shows the string it returns in the
/// text view (on the main thread).
- (void)showResultOfExtraction:(NSString * (^)(void))work {
    dispatch_async([self extractionQueue], ^{
        NSString *text = work();
        dispatch_async(dispatch_get_main_queue(), ^{
            self.textView1.text = text;
        });
    });
}

/// Returns the text to display for a string-returning extraction call.
///
/// A result starting with "@ErrCode" signals failure; in that case the
/// library's last error message is returned instead. The integer carried by
/// the "@ErrCode" result could alternatively be passed to errorDesc().
- (NSString *)textForStringResult:(NSString *)result {
    if (![result hasPrefix:@"@ErrCode"]) {
        return result;
    }
    return [self.hx lastErr];
}

/// Returns the text to display for a file-producing extraction call.
///
/// On success (`code` == 0) this is the content of `path`; if that file cannot
/// be read, the read error's description is returned rather than silently
/// showing nothing. On failure it is the message for `code`
/// (`[self.hx lastErr]` would work as well).
- (NSString *)textForCode:(int32_t)code outputFile:(NSString *)path {
    if (code != 0) {
        return errorDesc(code);
    }
    NSError *readError = nil;
    NSString *content = [NSString stringWithContentsOfFile:path
                                                  encoding:NSUTF8StringEncoding
                                                     error:&readError];
    if (content == nil) {
        return readError.localizedDescription;
    }
    return content;
}

#pragma mark - Actions

/// Extracts the text of a bundled sample file and shows it as a string.
- (IBAction)toString:(id)obj {
    // Source file shipped inside testFiles.bundle.
    NSBundle *bundle = [NSBundle bundleWithPath:[[NSBundle mainBundle] pathForResource:@"testFiles" ofType:@"bundle"]];
    NSString *fn = [bundle pathForResource:@"Adobe Intro" ofType:@".ofd"];
    if (fn == nil || ![[NSFileManager defaultManager] fileExistsAtPath:fn]) {
        [self alertX:@"file not exists!"];
        return;
    }

    [self showResultOfExtraction:^NSString * {
        return [self textForStringResult:[self.hx toString:fn]];
    }];
}

/// Extracts the text of a bundled sample file into Documents/out001.txt and
/// shows the content of that file.
- (IBAction)toText:(id)obj {
    // Source file shipped inside testFiles.bundle.
    NSBundle *bundle = [NSBundle bundleWithPath:[[NSBundle mainBundle] pathForResource:@"testFiles" ofType:@"bundle"]];
    NSString *fn = [bundle pathForResource:@"简可信模板OCR识别工具帮助" ofType:@".docx"];

    // Destination file.
    NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES);
    NSString *outTextFile = [paths.firstObject stringByAppendingPathComponent:@"out001.txt"];

    [self showResultOfExtraction:^NSString * {
        int32_t code = [self.hx toTextFile:fn outFile:outTextFile];
        return [self textForCode:code outputFile:outTextFile];
    }];
}

/// Downloads a PDF over HTTP, extracts its text and shows it as a string.
- (IBAction)httpToString:(id)obj {
    NSString *url = @"https://www.gaya-soft.cn/dfs/v2/graccvs文件正文提取接口.pdf";
    // Extra HTTP headers and cookies for the request, encoded as JSON.
    NSString *params2 = @"{\"headers\":[{\"client_id\": \"g01x9\"}, {\"client_secret\": \"e23c89cc9fe\"}], \"cookies\":[{\"name\": \"ga\", \"value\": \"1020\", \"expires\":36000000, \"path\": \"/\"}]}";

    [self showResultOfExtraction:^NSString * {
        // The timeout is given in milliseconds.
        NSString *s = [self.hx httpToString:url fileExt:@".pdf" timeout:180*1000 params:params2];
        return [self textForStringResult:s];
    }];
}

/// Downloads a .docx over HTTP, extracts its text into Documents/out002.txt
/// and shows the content of that file.
- (IBAction)httpToTextFile:(id)obj {
    NSString *url = @"https://www.gaya-soft.cn/dfs/v2/简可信模板OCR识别工具帮助.docx";

    // Destination file.
    NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES);
    NSString *outFile = [paths.firstObject stringByAppendingPathComponent:@"out002.txt"];

    [self showResultOfExtraction:^NSString * {
        // The timeout is given in milliseconds.
        int32_t code = [self.hx httpToTextFile:url fileExt:@".docx" outTxtFile:outFile timeout:0 params:@""];
        return [self textForCode:code outputFile:outFile];
    }];
}

@end
函数定义 Graccvs.xcframework\ios-arm64\Graccvs.framework\Versions\A\Headers\Graccvs.objc.h:
// Objective-C API for talking to graccvs Go package.
// gobind -lang=objc graccvs
//
// File is generated by gobind. Do not edit.
#ifndef __Graccvs_H__
#define __Graccvs_H__
@import Foundation;
#include "ref.h"
#include "Universe.objc.h"
@class GraccvsGraccvsLib;
/**
 * Objective-C binding of the Graccvs text-extraction library
 * (generated by gobind from the "graccvs" Go package).
 */
@interface GraccvsGraccvsLib : NSObject <goSeqRefInterface> {
}
@property(strong, readonly) _Nonnull id _ref;
- (nonnull instancetype)initWithRef:(_Nonnull id)ref;
- (nonnull instancetype)init;
/**
 * Registers the software license.
 *
 * @param corp    Licensee name. (presumably the company name — confirm)
 * @param licText License text.
 * @return An integer status code; its meaning is not documented here.
 */
- (int32_t)auth:(NSString* _Nullable)corp licText:(NSString* _Nullable)licText;
/**
 * Fetches a file over HTTP and extracts its text.
 *
 * @param url     http/https URL of the file.
 * @param fileExt Suffix that decides how the text is extracted.
 * @param timeout Timeout for fetching the file, in milliseconds.
 * @param params  Not documented for this method; presumably the HTTP headers
 *                and cookies, as for httpToTextFile — confirm.
 * @return The extracted text. NOTE(review): how failures are reported through
 *         the returned string is not documented here — confirm.
 */
- (NSString* _Nonnull)httpToString:(NSString* _Nullable)url fileExt:(NSString* _Nullable)fileExt timeout:(int32_t)timeout params:(NSString* _Nullable)params;
/**
 * Fetches a file over HTTP, extracts its text and saves it to a file.
 *
 * @param url        http/https URL of the file.
 * @param fileExt    Suffix that decides how the text is extracted.
 * @param outTxtFile Output filename.
 * @param timeout    Timeout for fetching the file, in milliseconds.
 * @param params     HTTP headers and cookies.
 * @return An integer status code; its meaning is not documented here.
 */
- (int32_t)httpToTextFile:(NSString* _Nullable)url fileExt:(NSString* _Nullable)fileExt outTxtFile:(NSString* _Nullable)outTxtFile timeout:(int32_t)timeout params:(NSString* _Nullable)params;
/**
 * Returns the last error.
 */
- (NSString* _Nonnull)lastErr;
/**
 * Initializes the dynamic library.
 *
 * @param tempdir Temporary folder the library has write permission for.
 */
- (void)load:(NSString* _Nullable)tempdir;
/**
 * NOTE(review): undocumented in the generated header; purpose unknown — confirm.
 */
- (void)logTest:(NSString* _Nullable)inFile text:(NSString* _Nullable)text;
/**
 * Extracts the text of a file and returns it as a string.
 *
 * @param inFile Input filename.
 */
- (NSString* _Nonnull)toString:(NSString* _Nullable)inFile;
/**
 * Extracts the text of a file and saves it to a text file.
 *
 * @param inFile  Input filename.
 * @param outFile Output filename.
 * @return An integer status code; its meaning is not documented here.
 */
- (int32_t)toTextFile:(NSString* _Nullable)inFile outFile:(NSString* _Nullable)outFile;
/**
 * Call this function before exit.
 */
- (void)unload;
@end
#endif