正文
记一次针对excel导出的优化
小程序:扫一扫查出行
【扫一扫了解最新限行尾号】
复制小程序
【扫一扫了解最新限行尾号】
复制小程序
最近发现我们系统导出excel文件时,由于要导出百万级数据,速度过慢并且内存占用多,故进行了下面的一次优化。
我们使用apache的poi进行excel文件操作
主要耗时:
1、从数据库得到需要导出的结果集耗时
2、将数据写入excel耗时
优化前
/**
 * Base class for objects that can be rendered as one row of an exported Excel file.
 * Implementations supply the sheet title, the column headers, and the cell values
 * for a single row.
 */
public abstract class BaseExcelOutputVo {

    /** Column header names, one entry per exported column. */
    public abstract String[] toExcelHeaders();

    /** Cell values for this row; must align index-for-index with {@link #toExcelHeaders()}. */
    public abstract Object[] toExcelData();

    /** Sheet title used when the workbook is created. */
    public abstract String getExcelTitle();
}

/**
 * Utility methods for exporting data to Excel workbooks and streaming the
 * resulting files back to the client as a single zip archive.
 */
public class ExcelUtils {

    // final: the logger is created once per class and never reassigned
    private static final Logger log = LoggerFactory.getLogger(ExcelUtils.class);
/**
 * Exports an unpaginated result list: splits it into chunks of at most
 * {@code pageSize} rows (one Excel file per chunk) and delegates to the
 * paged overload, which zips the generated workbooks into the response.
 *
 * @param request      incoming request (used downstream for browser detection)
 * @param response     response the zip archive is written to
 * @param dataList     all rows to export; the list is cleared as a side effect
 *                     once the pages have been copied out, to free memory early
 * @param downloadName zip archive name, should end with .zip
 * @param fileName     prefix for the Excel files inside the archive (no extension)
 * @param pageSize     maximum number of records per Excel file
 */
public static <T extends BaseExcelOutputVo> void excelExport(HttpServletRequest request, HttpServletResponse response, List<T> dataList, String downloadName, String fileName, int pageSize) {
    Stopwatch sw = Stopwatch.createStarted();
    int size = dataList.size();
    // Fix: the original computed "size/pageSize + 1" pages, which produced a
    // spurious empty trailing page whenever size was an exact multiple of
    // pageSize (and one empty page for an empty input). Plain subList
    // partitioning yields exactly ceil(size/pageSize) pages.
    List<List<T>> pages = new ArrayList<>();
    for (int from = 0; from < size; from += pageSize) {
        int to = Math.min(from + pageSize, size);
        // copy each chunk so clearing dataList below cannot invalidate it
        pages.add(new ArrayList<>(dataList.subList(from, to)));
    }
    dataList.clear(); // release the source list early to reduce heap pressure
    excelExport(request, response, pages, downloadName, fileName);
    log.trace("excel导出记录{}条,耗时{}", size, sw.stop().elapsed(TimeUnit.MILLISECONDS));
}
/**
* 将已经分好页的数据,按照分页导出多个excel,并打包为压缩包响应
* @param request
* @param response
* @param dataList 用于生成excel的数据,不同list生成不同excel文件
* @param downloadName 压缩包名字,以.zip结尾
* @param fileName excel文件名字前缀,不需要格式名
*/
public static <T extends BaseExcelOutputVo> void excelExport(HttpServletRequest request, HttpServletResponse response, List<List<T>> dataList,String downloadName,String fileName){
List<Workbook> excelList = new ArrayList<>(); for(List<T> data :dataList){
if(data.size()>0)
excelList.add(createExcel(data));
}
if(excelList.size() > 0 ) {
try {
downloadFiles(request, response, downloadName, fileName, excelList);
} catch (Exception e) {
e.printStackTrace();
}
}
} public static <T extends BaseExcelOutputVo> Workbook createExcel(List<T> data) {
if(data.size()>0) {
XSSFWorkbook workbook = null;
try {
workbook = (XSSFWorkbook) Class.forName("org.apache.poi.xssf.usermodel.XSSFWorkbook").newInstance(); //指定 sheet 的名字
Sheet sheet = workbook.createSheet(data.get(0).getExcelTitle()); // 列数
int cols = data.get(0).toExcelHeaders().length;
//复制一行用于设置header
data.add(0,data.get(0));
int rows = data.size();
int index = 0; //设置列头样式
XSSFCellStyle headerStyle = workbook.createCellStyle();
headerStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 居中格式
headerStyle.setFillForegroundColor(IndexedColors.SKY_BLUE.getIndex());// 背景色
headerStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND); //加粗 for (int rowNum = 0; rowNum < rows; rowNum++) {
Row row = sheet.createRow(rowNum); for (int colNum = 0; colNum < cols; colNum++) {
Cell cell = row.createCell(colNum);
if(rowNum == 0){
cell.setCellValue(data.get(0).toExcelHeaders()[colNum]);
cell.setCellStyle(headerStyle);
}else {
Object obj = data.get(index).toExcelData()[colNum];
if(obj instanceof Date) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
cell.setCellValue(sdf.format((Date) obj));
}else if(obj instanceof Integer){
cell.setCellValue((Integer)obj);
}else if(obj instanceof Double){
cell.setCellValue((Double)obj);
}else if(obj instanceof Long){
cell.setCellValue((Long)obj);
}else{
cell.setCellValue((String)obj);
}
}
sheet.autoSizeColumn(colNum);
sheet.setColumnWidth(colNum,sheet.getColumnWidth(colNum)*17/10);
} index++;
}
return workbook;
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} }
return null;
} private static void downloadFiles(HttpServletRequest request, HttpServletResponse response,String downloadName,String fileName,List<Workbook> excelList) throws Exception { //响应头的设置
response.reset();
response.setCharacterEncoding("utf-8");
response.setContentType("multipart/form-data"); //设置压缩包的名字
//解决不同浏览器压缩包名字含有中文时乱码的问题
String agent = request.getHeader("USER-AGENT");
try {
if (agent.contains("MSIE")||agent.contains("Trident")) {
downloadName = java.net.URLEncoder.encode(downloadName, "UTF-8");
} else {
downloadName = new String(downloadName.getBytes("UTF-8"),"ISO-8859-1");
}
} catch (Exception e) {
e.printStackTrace();
}
response.setHeader("Content-Disposition", "attachment;fileName=\"" + downloadName + "\""); //设置压缩流:直接写入response,实现边压缩边下载
ZipOutputStream zipos = null;
try {
zipos = new ZipOutputStream(new BufferedOutputStream(response.getOutputStream()));
zipos.setMethod(ZipOutputStream.DEFLATED); //设置压缩方法
} catch (Exception e) {
e.printStackTrace();
} //循环将文件写入压缩流
DataOutputStream os = null;
for(int i = 0; i < excelList.size(); i++ ){ InputStream in = null;
try{
//将excel文件转为输入流
ByteArrayOutputStream out = new ByteArrayOutputStream();
excelList.get(i).write(out);
byte [] bookByteAry = out.toByteArray();
in = new ByteArrayInputStream(bookByteAry); //添加ZipEntry,并ZipEntry中写入文件流
//这里,加上i是防止要下载的文件有重名的导致下载失败
zipos.putNextEntry(new ZipEntry( fileName + i +".xlsx"));
os = new DataOutputStream(zipos); byte[] b = new byte[100];
int length = 0;
while((length = in.read(b))!= -1){
os.write(b, 0, length);
}
in.close();
zipos.closeEntry();
} catch (IOException e) {
e.printStackTrace();
}
} //关闭流
try {
os.flush();
os.close();
zipos.close();
} catch (IOException e) {
e.printStackTrace();
} }
}
原有的导出工具类测试情况
excel导出记录5条,耗时574 ms
查询耗时 334 ms
导出总耗时908 ms
excel导出记录501条,耗时23638 ms
查询耗时 476 ms
导出总耗时24114 ms
数据转excel时间234429 ms
完成响应耗时234516 ms
excel导出记录5000条,耗时234516 ms
导出总耗时235518 ms
excel导出记录10001条,耗时460735 ms
查询耗时 2766 ms
导出总耗时463501 ms
发现主要耗时集中在生成excel文件上,随着导出记录数增加该比重明显上升,1万记录时已经占了99.5%
注意XSSFWorkbook这个类是个坑
不知道各位有没发现一个奇怪的事情,500条记录的导出也花费了20S左右的时间。按理说500条只是一个很小的数目,20S左右怎么也能导上万条记录了。为此,我把每个row的生成时间打印了出来,发现开始时写的时间是10ms/行,500行时慢慢增加到100ms/行,对于内容相近的每行,写入时间增加了有10倍,感觉是因为不停插入数据导致excel的对象变大了的原因?改用SXSSFWorkbook后写速度稳定在5ms/行左右。SXSSFWorkbook相对于XSSFWorkbook来讲,使用了滑动窗口的方式,只保存设置的最大记录数在内存中(默认是100行),其他的记录将会被保存在一个临时文件当中,以此来降低大数据量操作时的内存占用,写入完成后需要调用dispose()方法将临时文件删除。并且经过测试SXSSFWorkbook的单个cell写入耗时只有XSSFWorkbook的1/10,对于不需要随机读取以前数据的程序来讲,SXSSFWorkbook是一个更好的选择。
每行三列同样长度的随机字符串写入workbook中,XSSFWorkbook与SXSSFWorkbook测试对比
StopWatch 'col0': running time = 4033300 ns
StopWatch 'col1': running time = 74000 ns
StopWatch 'col2': running time = 70200 ns
StopWatch 'row0': running time = 5570700 nsStopWatch 'col0': running time = 47800 ns
StopWatch 'col1': running time = 53200 ns
StopWatch 'col2': running time = 38400 ns
StopWatch 'row1': running time = 216300 nsStopWatch 'col0': running time = 35000 ns
StopWatch 'col1': running time = 39900 ns
StopWatch 'col2': running time = 36200 ns
StopWatch 'row2': running time = 205200 nsStopWatch 'col0': running time = 25600 ns
StopWatch 'col1': running time = 19900 ns
StopWatch 'col2': running time = 24200 ns
StopWatch 'row3': running time = 141700 nsStopWatch 'col0': running time = 23600 ns
StopWatch 'col1': running time = 21700 ns
StopWatch 'col2': running time = 24300 ns
StopWatch 'row4': running time = 185200 nsStopWatch 'col0': running time = 19500 ns
StopWatch 'col1': running time = 21000 ns
StopWatch 'col2': running time = 24900 ns
StopWatch 'row5': running time = 357100 nsStopWatch 'col0': running time = 23900 ns
StopWatch 'col1': running time = 25400 ns
StopWatch 'col2': running time = 26400 ns
StopWatch 'row6': running time = 503600 nsStopWatch 'col0': running time = 23600 ns
StopWatch 'col1': running time = 21900 ns
StopWatch 'col2': running time = 27000 ns
StopWatch 'row7': running time = 157700 nsStopWatch 'col0': running time = 63300 ns
StopWatch 'col1': running time = 21600 ns
StopWatch 'col2': running time = 24100 ns
StopWatch 'row8': running time = 193900 nsStopWatch 'col0': running time = 20700 ns
StopWatch 'col1': running time = 21100 ns
StopWatch 'col2': running time = 24300 ns
StopWatch 'row9': running time = 137000 nsStopWatch 'sxssf': running time = 565115800 ns
---------------------------------------------
ns % Task name
---------------------------------------------
565115800 100% StopWatch 'col0': running time = 17104200 ns
StopWatch 'col1': running time = 597400 ns
StopWatch 'col2': running time = 371600 ns
StopWatch 'row0': running time = 26082100 nsStopWatch 'col0': running time = 315700 ns
StopWatch 'col1': running time = 429100 ns
StopWatch 'col2': running time = 1941400 ns
StopWatch 'row1': running time = 2899200 nsStopWatch 'col0': running time = 422900 ns
StopWatch 'col1': running time = 660600 ns
StopWatch 'col2': running time = 327400 ns
StopWatch 'row2': running time = 1623800 nsStopWatch 'col0': running time = 343100 ns
StopWatch 'col1': running time = 289400 ns
StopWatch 'col2': running time = 275700 ns
StopWatch 'row3': running time = 1072900 nsStopWatch 'col0': running time = 275900 ns
StopWatch 'col1': running time = 258700 ns
StopWatch 'col2': running time = 613400 ns
StopWatch 'row4': running time = 1330700 nsStopWatch 'col0': running time = 899900 ns
StopWatch 'col1': running time = 264500 ns
StopWatch 'col2': running time = 235400 ns
StopWatch 'row5': running time = 1572100 nsStopWatch 'col0': running time = 224000 ns
StopWatch 'col1': running time = 219000 ns
StopWatch 'col2': running time = 248200 ns
StopWatch 'row6': running time = 826200 nsStopWatch 'col0': running time = 306100 ns
StopWatch 'col1': running time = 272700 ns
StopWatch 'col2': running time = 305300 ns
StopWatch 'row7': running time = 1047000 nsStopWatch 'col0': running time = 240100 ns
StopWatch 'col1': running time = 246600 ns
StopWatch 'col2': running time = 278600 ns
StopWatch 'row8': running time = 921000 nsStopWatch 'col0': running time = 248300 ns
StopWatch 'col1': running time = 239700 ns
StopWatch 'col2': running time = 245300 ns
StopWatch 'row9': running time = 869600 nsStopWatch 'xssf': running time = 44311200 ns
---------------------------------------------
ns % Task name
---------------------------------------------
044311200 100%
上述excel工具类有两个问题
- 导出的耗时明显的长
- 无论是使用哪一个excelExport入口,它都需要应用先把所有要导出的数据查出放到List中,这样当数据量去到几百万的时候会占用大量应用内存
优化一
为了避免多个线程同时写一个文件出现问题,目前采取了N条记录一个文件,每个文件一个线程写的设计。分sheet操作感觉也能尝试,至于同一个sheet的操作因为poi不是线程安全的,网上我看过几个demo好像都是会报错,不建议使用。
以下是使用多线程写文件改造后核心代码,这里最大线程数最多设置为电脑可用线程数*2,测试过*3或者*8速度会更快,不过这个功能不是核心功能,所以就不占用太多资源
// Upper bound for worker threads: capped relative to the machine's available
// processors so the export cannot starve other requests.
static final int nThreads = Runtime.getRuntime().availableProcessors();

/**
 * Builds one SXSSF workbook per page of {@code data}, each on its own worker
 * thread, and blocks until every page has been written.
 *
 * @param data pages of rows; each page becomes one workbook
 * @return the workbooks, in completion order (thread-safe list)
 */
public static <T extends BaseExcelOutputVo> List<Workbook> createExcel_v2(List<List<T>> data) {
    CountDownLatch countDownLatch = new CountDownLatch(data.size());
    List<Workbook> excelList = new CopyOnWriteArrayList<>();
    if (data.size() > 0) {
        ExecutorService executor = Executors.newFixedThreadPool(Math.min(data.size(), nThreads * 2));
        data.forEach(list -> executor.execute(new pageTask<>(countDownLatch, list, excelList)));
        // Fix: the original never shut the pool down, leaking its threads on
        // every call. Shutdown after submission; the latch below still blocks
        // until all tasks have finished.
        executor.shutdown();
    }
    try {
        countDownLatch.await();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status
        e.printStackTrace();
    }
    return excelList;
}

/**
 * Worker that writes one page of rows into its own SXSSFWorkbook (100-row
 * sliding window, older rows spill to a temp file) and appends the finished
 * workbook to a shared, thread-safe result list.
 *
 * NOTE(review): the finished workbooks still hold SXSSF temp files; dispose()
 * must be called after they have been written out, or the files accumulate.
 */
static class pageTask<T extends BaseExcelOutputVo> implements Runnable {

    private CountDownLatch countDownLatch; // may be null when the latch-less constructor is used
    private List<T> data;
    // Widened from List<SXSSFWorkbook> to List<Workbook> so callers no longer
    // need raw-type constructor calls (SXSSFWorkbook implements Workbook).
    private List<Workbook> excelList;

    public pageTask(CountDownLatch countDownLatch, List<T> data, List<Workbook> excelList) {
        this.countDownLatch = countDownLatch;
        this.data = data;
        this.excelList = excelList;
    }

    public pageTask(List<T> data, List<Workbook> excelList) {
        this.data = data;
        this.excelList = excelList;
    }

    @Override
    public void run() {
        try {
            Stopwatch sw = Stopwatch.createStarted();
            // Keep at most 100 rows in memory; the rest spill to a temp file.
            SXSSFWorkbook workbook = new SXSSFWorkbook(100);
            CellStyle headerStyle = workbook.createCellStyle();
            headerStyle.setAlignment(CellStyle.ALIGN_CENTER);
            headerStyle.setFillForegroundColor(IndexedColors.SKY_BLUE.getIndex());
            headerStyle.setFillPattern(CellStyle.SOLID_FOREGROUND);
            SXSSFSheet sheet = workbook.createSheet(data.get(0).getExcelTitle());
            // Data format "@" marks cells as text so Excel leaves values alone.
            CellStyle textStyle = workbook.createCellStyle();
            DataFormat format = workbook.createDataFormat();
            textStyle.setDataFormat(format.getFormat("@"));
            String[] headers = data.get(0).toExcelHeaders();
            int cols = headers.length;
            // Fix: write the header row directly instead of prepending a
            // duplicate element into the data list as the original did.
            SXSSFRow headerRow = sheet.createRow(0);
            for (int colNum = 0; colNum < cols; colNum++) {
                Cell cell = headerRow.createCell(colNum);
                cell.setCellValue(headers[colNum]);
                cell.setCellStyle(headerStyle);
            }
            // Hoisted: one formatter per page instead of one per Date cell;
            // safe because each task runs on a single thread.
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
            for (int rowNum = 1; rowNum <= data.size(); rowNum++) {
                SXSSFRow row = sheet.createRow(rowNum);
                Object[] rowData = data.get(rowNum - 1).toExcelData();
                for (int colNum = 0; colNum < cols; colNum++) {
                    Cell cell = row.createCell(colNum);
                    cell.setCellStyle(textStyle);
                    cell.setCellType(XSSFCell.CELL_TYPE_STRING);
                    Object obj = rowData[colNum];
                    if (obj instanceof Date) {
                        cell.setCellValue(sdf.format((Date) obj));
                    } else if (obj instanceof Integer) {
                        cell.setCellValue(String.valueOf(obj));
                    } else if (obj instanceof Double) {
                        cell.setCellValue(String.valueOf(obj));
                    } else if (obj instanceof Long) {
                        cell.setCellValue(String.valueOf(obj));
                    } else {
                        cell.setCellValue((String) obj);
                    }
                }
            }
            // Hoisted: one tracking call suffices; the original re-invoked
            // trackAllColumnsForAutoSizing for every column.
            sheet.trackAllColumnsForAutoSizing();
            for (int colNum = 0; colNum < cols; colNum++) {
                sheet.autoSizeColumn(colNum);
                sheet.setColumnWidth(colNum, sheet.getColumnWidth(colNum) * 17 / 10);
            }
            excelList.add(workbook);
            log.trace("线程{}创建excel完成,耗时{}", Thread.currentThread().getName(), sw.stop().elapsed(TimeUnit.MILLISECONDS));
        } catch (Exception e) {
            // Fix: null-guard before data.size() — the original would have
            // NPE'd while logging if data were ever null here.
            log.trace("线程名{}data大小{}data是否为空{}", Thread.currentThread().getName(), data == null ? 0 : data.size(), data == null);
            e.printStackTrace();
        } finally {
            if (countDownLatch != null) {
                countDownLatch.countDown();
            }
            // Drop the page data so it can be GC'd; without this the heap grew
            // steadily during large exports (~25% savings observed).
            if (data != null) {
                data.clear();
            }
            data = null;
        }
    }
}
测试情况,500行一个文件
线程pool-12-thread-7创建excel完成,耗时716
线程pool-12-thread-5创建excel完成,耗时733
线程pool-12-thread-6创建excel完成,耗时734
线程pool-12-thread-1创建excel完成,耗时750
线程pool-12-thread-11创建excel完成,耗时756
线程pool-12-thread-9创建excel完成,耗时776
...
excel导出记录5500条,耗时1564
导出总耗时3074
线程pool-12-thread-10创建excel完成,耗时646 ms
线程pool-12-thread-3创建excel完成,耗时648 ms
线程pool-12-thread-6创建excel完成,耗时668 ms
线程pool-12-thread-7创建excel完成,耗时678 ms
线程pool-12-thread-5创建excel完成,耗时682 ms
线程pool-12-thread-9创建excel完成,耗时684 ms
线程pool-12-thread-4创建excel完成,耗时687 ms
线程pool-12-thread-1创建excel完成,耗时707 ms
线程pool-12-thread-2创建excel完成,耗时716 ms
线程pool-12-thread-8创建excel完成,耗时721 ms
excel导出记录5000条,耗时1479 ms
导出总耗时2673 ms
线程pool-13-thread-1创建excel完成,耗时109
excel导出记录500条,耗时181
导出总耗时436
测试情况,5000行一个文件
线程pool-15-thread-2创建excel完成,耗时35
线程pool-15-thread-1创建excel完成,耗时61
excel导出记录5500条,耗时204
导出总耗时1341
线程pool-14-thread-1创建excel完成,耗时56
excel导出记录5000条,耗时137
导出总耗时1103
线程pool-12-thread-1创建excel完成,耗时538
excel导出记录500条,耗时708
导出总耗时999
注意不要图快开大量线程把cpu占满了,这样其他请求这机器就处理不了了,而且当线程数超过当前服务器可用线程数时,线程切换频繁,速度也会变慢。
计算最大线程数方法参考
生成excel文件速度优化后,还存在一个问题:这里用于导出的数据是一次性加载到内存中的,当数据量大的时候会占用大量内存,需改用读一部分写一部分的形式。
优化二
采用了边加载边写excel文件的形式减少内存占用,类似于生产者消费者模型,使用队列进行数据的临时存储及交换,生产者线程不停将要写入的数据从数据库取出,经处理后存入队列中,消费者线程即excel线程管理类启动后不停从队列中取数据,当数据积累够指定数量或者所有要写excel的数据已经从队列中取出时,将这些数据交给写excel线程进行写入。大概流程如下:
消费者类
/**
 * Consumer side of the producer/consumer export pipeline: drains rows from a
 * shared queue, batches them into pages of SINGLE_FILE_SIZE, hands each page
 * to a pageTask writer thread, and finally zips all workbooks into the
 * response via downloadFiles.
 *
 * The producer pushes rows into {@code queue} and calls {@link #setEnd} once
 * all rows have been enqueued.
 */
public static class ExportThread<T extends BaseExcelOutputVo> implements Runnable {

    private LinkedBlockingQueue<T> queue;
    // Fix: volatile — written by the producer thread, read here; without it
    // this thread may never observe setEnd(true) and loop forever.
    private volatile boolean isEnd = false;
    private List<T> list = new CopyOnWriteArrayList<>();
    List<Workbook> excelList;
    HttpServletRequest request;
    HttpServletResponse response;
    String downloadName;
    String fileName;
    ExecutorService executor;

    public ExportThread(LinkedBlockingQueue<T> queue, HttpServletRequest request, HttpServletResponse response, String downloadName, String fileName) {
        this.queue = queue;
        this.request = request;
        this.response = response;
        this.downloadName = downloadName;
        this.fileName = fileName;
    }

    /** Called by the producer once every row has been put on the queue. */
    public void setEnd(boolean end) {
        isEnd = end;
    }

    @Override
    public void run() {
        try {
            excelList = new CopyOnWriteArrayList<>();
            executor = Executors.newFixedThreadPool(nThreads * 2);
            while (!isEnd || queue.size() > 0) {
                while (queue.size() > 0) {
                    // Fix: poll can time out and return null; the original
                    // added the result unconditionally, so a timeout would put
                    // a null row into the page and NPE inside pageTask.
                    T item = queue.poll(60 * 1000, TimeUnit.MILLISECONDS);
                    if (item != null) {
                        list.add(item);
                    }
                    if (list.size() == SINGLE_FILE_SIZE) {
                        break;
                    }
                }
                // Flush a page when it is full, or when this is the final
                // partial page (producer done and queue drained).
                if (list.size() > 0 && (list.size() == SINGLE_FILE_SIZE || (isEnd && queue.size() == 0))) {
                    executor.execute(new pageTask(new CopyOnWriteArrayList(list), excelList));
                    list.clear();
                }
            }
            executor.shutdown();
            // Fix: the original busy-spun on `while(!executor.isTerminated()){}`,
            // pinning a core; awaitTermination blocks instead.
            while (!executor.awaitTermination(1, TimeUnit.SECONDS)) {
                // keep waiting until every pageTask writer has finished
            }
            if (excelList.size() > 0) {
                try {
                    downloadFiles(request, response, downloadName, fileName, excelList);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
            e.printStackTrace();
        }
    }
}
测试情况
文件分页 5000&查询分页 5000
数据量 | 耗时ms |
---|---|
一千 | 1162 |
十万 | 18579 |
一百万 | 611380 |
一百万的数据量时单次查询会越来越慢,是mysql分页查询里面limit偏移量变大的影响。
如果这样优化后导出还是需要十秒以上,为了避免导出数据太多导致页面响应超时,可以采用异步的形式,后端接收到导出请求并检查参数正确后返回成功,再提供一个接口用来查询该次导出是否已完成,完成再返回文件,未完成则需要页面过段时间再次调用查询接口检查。