正文
记一次针对excel导出的优化
小程序:扫一扫查出行
【扫一扫了解最新限行尾号】
复制小程序
【扫一扫了解最新限行尾号】
复制小程序
最近发现我们系统导出excel文件时,由于要导出百万级数据,速度过慢并且内存占用多,故进行了下面的一次优化。
我们使用apache的poi进行excel文件操作
主要耗时:
1、从数据库得到需要导出的结果集耗时
2、将数据写入excel耗时
优化前
/**
 * Base class for objects that can be rendered as one row of an exported Excel file.
 * Implementations supply the sheet title, the column headers, and the cell values
 * for a single row.
 */
public abstract class BaseExcelOutputVo {

    /** Column header names, one entry per exported column. */
    public abstract String[] toExcelHeaders();

    /** Cell values for this row; must align index-for-index with {@link #toExcelHeaders()}. */
    public abstract Object[] toExcelData();

    /** Sheet title used when the workbook is created. */
    public abstract String getExcelTitle();
}

/**
 * Utility methods for exporting data to Excel workbooks and streaming the
 * resulting files back to the client as a single zip archive.
 */
public class ExcelUtils {

    // final: the logger is created once per class and never reassigned
    private static final Logger log = LoggerFactory.getLogger(ExcelUtils.class);
/**
 * Exports an unpaginated result list: splits it into chunks of at most
 * {@code pageSize} rows (one Excel file per chunk) and delegates to the
 * paged overload, which zips the generated workbooks into the response.
 *
 * @param request      incoming request (used downstream for browser detection)
 * @param response     response the zip archive is written to
 * @param dataList     all rows to export; the list is cleared as a side effect
 *                     once the pages have been copied out, to free memory early
 * @param downloadName zip archive name, should end with .zip
 * @param fileName     prefix for the Excel files inside the archive (no extension)
 * @param pageSize     maximum number of records per Excel file
 */
public static <T extends BaseExcelOutputVo> void excelExport(HttpServletRequest request, HttpServletResponse response, List<T> dataList, String downloadName, String fileName, int pageSize) {
    Stopwatch sw = Stopwatch.createStarted();
    int size = dataList.size();
    // Fix: the original computed "size/pageSize + 1" pages, which produced a
    // spurious empty trailing page whenever size was an exact multiple of
    // pageSize (and one empty page for an empty input). Plain subList
    // partitioning yields exactly ceil(size/pageSize) pages.
    List<List<T>> pages = new ArrayList<>();
    for (int from = 0; from < size; from += pageSize) {
        int to = Math.min(from + pageSize, size);
        // copy each chunk so clearing dataList below cannot invalidate it
        pages.add(new ArrayList<>(dataList.subList(from, to)));
    }
    dataList.clear(); // release the source list early to reduce heap pressure
    excelExport(request, response, pages, downloadName, fileName);
    log.trace("excel导出记录{}条,耗时{}", size, sw.stop().elapsed(TimeUnit.MILLISECONDS));
}
/**
* 将已经分好页的数据,按照分页导出多个excel,并打包为压缩包响应
* @param request
* @param response
* @param dataList 用于生成excel的数据,不同list生成不同excel文件
* @param downloadName 压缩包名字,以.zip结尾
* @param fileName excel文件名字前缀,不需要格式名
*/
public static <T extends BaseExcelOutputVo> void excelExport(HttpServletRequest request, HttpServletResponse response, List<List<T>> dataList,String downloadName,String fileName){
List<Workbook> excelList = new ArrayList<>(); for(List<T> data :dataList){
if(data.size()>0)
excelList.add(createExcel(data));
}
if(excelList.size() > 0 ) {
try {
downloadFiles(request, response, downloadName, fileName, excelList);
} catch (Exception e) {
e.printStackTrace();
}
}
} public static <T extends BaseExcelOutputVo> Workbook createExcel(List<T> data) {
if(data.size()>0) {
XSSFWorkbook workbook = null;
try {
workbook = (XSSFWorkbook) Class.forName("org.apache.poi.xssf.usermodel.XSSFWorkbook").newInstance(); //指定 sheet 的名字
Sheet sheet = workbook.createSheet(data.get(0).getExcelTitle()); // 列数
int cols = data.get(0).toExcelHeaders().length;
//复制一行用于设置header
data.add(0,data.get(0));
int rows = data.size();
int index = 0; //设置列头样式
XSSFCellStyle headerStyle = workbook.createCellStyle();
headerStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 居中格式
headerStyle.setFillForegroundColor(IndexedColors.SKY_BLUE.getIndex());// 背景色
headerStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND); //加粗 for (int rowNum = 0; rowNum < rows; rowNum++) {
Row row = sheet.createRow(rowNum); for (int colNum = 0; colNum < cols; colNum++) {
Cell cell = row.createCell(colNum);
if(rowNum == 0){
cell.setCellValue(data.get(0).toExcelHeaders()[colNum]);
cell.setCellStyle(headerStyle);
}else {
Object obj = data.get(index).toExcelData()[colNum];
if(obj instanceof Date) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
cell.setCellValue(sdf.format((Date) obj));
}else if(obj instanceof Integer){
cell.setCellValue((Integer)obj);
}else if(obj instanceof Double){
cell.setCellValue((Double)obj);
}else if(obj instanceof Long){
cell.setCellValue((Long)obj);
}else{
cell.setCellValue((String)obj);
}
}
sheet.autoSizeColumn(colNum);
sheet.setColumnWidth(colNum,sheet.getColumnWidth(colNum)*17/10);
} index++;
}
return workbook;
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} }
return null;
} private static void downloadFiles(HttpServletRequest request, HttpServletResponse response,String downloadName,String fileName,List<Workbook> excelList) throws Exception { //响应头的设置
response.reset();
response.setCharacterEncoding("utf-8");
response.setContentType("multipart/form-data"); //设置压缩包的名字
//解决不同浏览器压缩包名字含有中文时乱码的问题
String agent = request.getHeader("USER-AGENT");
try {
if (agent.contains("MSIE")||agent.contains("Trident")) {
downloadName = java.net.URLEncoder.encode(downloadName, "UTF-8");
} else {
downloadName = new String(downloadName.getBytes("UTF-8"),"ISO-8859-1");
}
} catch (Exception e) {
e.printStackTrace();
}
response.setHeader("Content-Disposition", "attachment;fileName=\"" + downloadName + "\""); //设置压缩流:直接写入response,实现边压缩边下载
ZipOutputStream zipos = null;
try {
zipos = new ZipOutputStream(new BufferedOutputStream(response.getOutputStream()));
zipos.setMethod(ZipOutputStream.DEFLATED); //设置压缩方法
} catch (Exception e) {
e.printStackTrace();
} //循环将文件写入压缩流
DataOutputStream os = null;
for(int i = 0; i < excelList.size(); i++ ){ InputStream in = null;
try{
//将excel文件转为输入流
ByteArrayOutputStream out = new ByteArrayOutputStream();
excelList.get(i).write(out);
byte [] bookByteAry = out.toByteArray();
in = new ByteArrayInputStream(bookByteAry); //添加ZipEntry,并ZipEntry中写入文件流
//这里,加上i是防止要下载的文件有重名的导致下载失败
zipos.putNextEntry(new ZipEntry( fileName + i +".xlsx"));
os = new DataOutputStream(zipos); byte[] b = new byte[100];
int length = 0;
while((length = in.read(b))!= -1){
os.write(b, 0, length);
}
in.close();
zipos.closeEntry();
} catch (IOException e) {
e.printStackTrace();
}
} //关闭流
try {
os.flush();
os.close();
zipos.close();
} catch (IOException e) {
e.printStackTrace();
} }
}
原有的导出工具类测试情况
excel导出记录5条,耗时574 ms
查询耗时 334 ms
导出总耗时908 ms
excel导出记录501条,耗时23638 ms
查询耗时 476 ms
导出总耗时24114 ms
数据转excel时间234429 ms
完成响应耗时234516 ms
excel导出记录5000条,耗时234516 ms
导出总耗时235518 ms
excel导出记录10001条,耗时460735 ms
查询耗时 2766 ms
导出总耗时463501 ms
发现主要耗时集中在生成excel文件上,随着导出记录数增加该比重明显上升,1万记录时已经占了99.5%
注意XSSFWorkbook这个类是个坑
不知道各位有没发现一个奇怪的事情,500条记录的导出也花费了20S左右的时间。按理说500条只是一个很小的数目,20S左右怎么也能导上万条记录了。为此,我把每个row的生成时间打印了出来,发现开始时写的时间是10ms/行,500行时慢慢增加到100ms/行,对于内容相近的每行,写入时间增加了有10倍,感觉是因为不停插入数据导致excel的对象变大了的原因?改用SXSSFWorkbook后写速度稳定在5ms/行左右。SXSSFWorkbook相对于XSSFWorkbook来讲,使用了滑动窗口的方式,只保存设置的最大记录数在内存中(默认是100行),其他的记录将会被保存在一个临时文件当中,以此来降低大数据量操作时的内存占用,写入完成后需要调用dispose()方法将临时文件删除。并且经过测试SXSSFWorkbook的单个cell写入耗时只有XSSFWorkbook的1/10,对于不需要随机读取以前数据的程序来讲,SXSSFWorkbook是一个更好的选择。
每行三列同样长度的随机字符串写入workbook中,XSSFWorkbook与SXSSFWorkbook测试对比
StopWatch 'col0': running time = 4033300 ns
StopWatch 'col1': running time = 74000 ns
StopWatch 'col2': running time = 70200 ns
StopWatch 'row0': running time = 5570700 nsStopWatch 'col0': running time = 47800 ns
StopWatch 'col1': running time = 53200 ns
StopWatch 'col2': running time = 38400 ns
StopWatch 'row1': running time = 216300 nsStopWatch 'col0': running time = 35000 ns
StopWatch 'col1': running time = 39900 ns
StopWatch 'col2': running time = 36200 ns
StopWatch 'row2': running time = 205200 nsStopWatch 'col0': running time = 25600 ns
StopWatch 'col1': running time = 19900 ns
StopWatch 'col2': running time = 24200 ns
StopWatch 'row3': running time = 141700 nsStopWatch 'col0': running time = 23600 ns
StopWatch 'col1': running time = 21700 ns
StopWatch 'col2': running time = 24300 ns
StopWatch 'row4': running time = 185200 nsStopWatch 'col0': running time = 19500 ns
StopWatch 'col1': running time = 21000 ns
StopWatch 'col2': running time = 24900 ns
StopWatch 'row5': running time = 357100 nsStopWatch 'col0': running time = 23900 ns
StopWatch 'col1': running time = 25400 ns
StopWatch 'col2': running time = 26400 ns
StopWatch 'row6': running time = 503600 nsStopWatch 'col0': running time = 23600 ns
StopWatch 'col1': running time = 21900 ns
StopWatch 'col2': running time = 27000 ns
StopWatch 'row7': running time = 157700 nsStopWatch 'col0': running time = 63300 ns
StopWatch 'col1': running time = 21600 ns
StopWatch 'col2': running time = 24100 ns
StopWatch 'row8': running time = 193900 nsStopWatch 'col0': running time = 20700 ns
StopWatch 'col1': running time = 21100 ns
StopWatch 'col2': running time = 24300 ns
StopWatch 'row9': running time = 137000 nsStopWatch 'sxssf': running time = 565115800 ns
---------------------------------------------
ns % Task name
---------------------------------------------
565115800 100% StopWatch 'col0': running time = 17104200 ns
StopWatch 'col1': running time = 597400 ns
StopWatch 'col2': running time = 371600 ns
StopWatch 'row0': running time = 26082100 nsStopWatch 'col0': running time = 315700 ns
StopWatch 'col1': running time = 429100 ns
StopWatch 'col2': running time = 1941400 ns
StopWatch 'row1': running time = 2899200 nsStopWatch 'col0': running time = 422900 ns
StopWatch 'col1': running time = 660600 ns
StopWatch 'col2': running time = 327400 ns
StopWatch 'row2': running time = 1623800 nsStopWatch 'col0': running time = 343100 ns
StopWatch 'col1': running time = 289400 ns
StopWatch 'col2': running time = 275700 ns
StopWatch 'row3': running time = 1072900 nsStopWatch 'col0': running time = 275900 ns
StopWatch 'col1': running time = 258700 ns
StopWatch 'col2': running time = 613400 ns
StopWatch 'row4': running time = 1330700 nsStopWatch 'col0': running time = 899900 ns
StopWatch 'col1': running time = 264500 ns
StopWatch 'col2': running time = 235400 ns
StopWatch 'row5': running time = 1572100 nsStopWatch 'col0': running time = 224000 ns
StopWatch 'col1': running time = 219000 ns
StopWatch 'col2': running time = 248200 ns
StopWatch 'row6': running time = 826200 nsStopWatch 'col0': running time = 306100 ns
StopWatch 'col1': running time = 272700 ns
StopWatch 'col2': running time = 305300 ns
StopWatch 'row7': running time = 1047000 nsStopWatch 'col0': running time = 240100 ns
StopWatch 'col1': running time = 246600 ns
StopWatch 'col2': running time = 278600 ns
StopWatch 'row8': running time = 921000 nsStopWatch 'col0': running time = 248300 ns
StopWatch 'col1': running time = 239700 ns
StopWatch 'col2': running time = 245300 ns
StopWatch 'row9': running time = 869600 nsStopWatch 'xssf': running time = 44311200 ns
---------------------------------------------
ns % Task name
---------------------------------------------
044311200 100%
上述excel工具类有两个问题
- 导出的耗时明显的长
- 无论是使用哪一个excelExport入口,它都需要应用先把所有要导出的数据查出放到List中,这样当数据量去到几百万的时候会占用大量应用内存
优化一
为了避免多个线程同时写一个文件出现问题,目前采取了N条记录一个文件,每个文件一个线程写的设计。分sheet操作感觉也能尝试,至于同一个sheet的操作因为poi不是线程安全的,网上我看过几个demo好像都是会报错,不建议使用。
以下是使用多线程写文件改造后核心代码,这里最大线程数最多设置为电脑可用线程数*2,测试过*3或者*8速度会更快,不过这个功能不是核心功能,所以就不占用太多资源
// Upper bound for worker threads: capped relative to the machine's available
// processors so the export cannot starve other requests.
static final int nThreads = Runtime.getRuntime().availableProcessors();

/**
 * Builds one SXSSF workbook per page of {@code data}, each on its own worker
 * thread, and blocks until every page has been written.
 *
 * @param data pages of rows; each page becomes one workbook
 * @return the workbooks, in completion order (thread-safe list)
 */
public static <T extends BaseExcelOutputVo> List<Workbook> createExcel_v2(List<List<T>> data) {
    CountDownLatch countDownLatch = new CountDownLatch(data.size());
    List<Workbook> excelList = new CopyOnWriteArrayList<>();
    if (data.size() > 0) {
        ExecutorService executor = Executors.newFixedThreadPool(Math.min(data.size(), nThreads * 2));
        data.forEach(list -> executor.execute(new pageTask<>(countDownLatch, list, excelList)));
        // Fix: the original never shut the pool down, leaking its threads on
        // every call. Shutdown after submission; the latch below still blocks
        // until all tasks have finished.
        executor.shutdown();
    }
    try {
        countDownLatch.await();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status
        e.printStackTrace();
    }
    return excelList;
}

/**
 * Worker that writes one page of rows into its own SXSSFWorkbook (100-row
 * sliding window, older rows spill to a temp file) and appends the finished
 * workbook to a shared, thread-safe result list.
 *
 * NOTE(review): the finished workbooks still hold SXSSF temp files; dispose()
 * must be called after they have been written out, or the files accumulate.
 */
static class pageTask<T extends BaseExcelOutputVo> implements Runnable {

    private CountDownLatch countDownLatch; // may be null when the latch-less constructor is used
    private List<T> data;
    // Widened from List<SXSSFWorkbook> to List<Workbook> so callers no longer
    // need raw-type constructor calls (SXSSFWorkbook implements Workbook).
    private List<Workbook> excelList;

    public pageTask(CountDownLatch countDownLatch, List<T> data, List<Workbook> excelList) {
        this.countDownLatch = countDownLatch;
        this.data = data;
        this.excelList = excelList;
    }

    public pageTask(List<T> data, List<Workbook> excelList) {
        this.data = data;
        this.excelList = excelList;
    }

    @Override
    public void run() {
        try {
            Stopwatch sw = Stopwatch.createStarted();
            // Keep at most 100 rows in memory; the rest spill to a temp file.
            SXSSFWorkbook workbook = new SXSSFWorkbook(100);
            CellStyle headerStyle = workbook.createCellStyle();
            headerStyle.setAlignment(CellStyle.ALIGN_CENTER);
            headerStyle.setFillForegroundColor(IndexedColors.SKY_BLUE.getIndex());
            headerStyle.setFillPattern(CellStyle.SOLID_FOREGROUND);
            SXSSFSheet sheet = workbook.createSheet(data.get(0).getExcelTitle());
            // Data format "@" marks cells as text so Excel leaves values alone.
            CellStyle textStyle = workbook.createCellStyle();
            DataFormat format = workbook.createDataFormat();
            textStyle.setDataFormat(format.getFormat("@"));
            String[] headers = data.get(0).toExcelHeaders();
            int cols = headers.length;
            // Fix: write the header row directly instead of prepending a
            // duplicate element into the data list as the original did.
            SXSSFRow headerRow = sheet.createRow(0);
            for (int colNum = 0; colNum < cols; colNum++) {
                Cell cell = headerRow.createCell(colNum);
                cell.setCellValue(headers[colNum]);
                cell.setCellStyle(headerStyle);
            }
            // Hoisted: one formatter per page instead of one per Date cell;
            // safe because each task runs on a single thread.
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
            for (int rowNum = 1; rowNum <= data.size(); rowNum++) {
                SXSSFRow row = sheet.createRow(rowNum);
                Object[] rowData = data.get(rowNum - 1).toExcelData();
                for (int colNum = 0; colNum < cols; colNum++) {
                    Cell cell = row.createCell(colNum);
                    cell.setCellStyle(textStyle);
                    cell.setCellType(XSSFCell.CELL_TYPE_STRING);
                    Object obj = rowData[colNum];
                    if (obj instanceof Date) {
                        cell.setCellValue(sdf.format((Date) obj));
                    } else if (obj instanceof Integer) {
                        cell.setCellValue(String.valueOf(obj));
                    } else if (obj instanceof Double) {
                        cell.setCellValue(String.valueOf(obj));
                    } else if (obj instanceof Long) {
                        cell.setCellValue(String.valueOf(obj));
                    } else {
                        cell.setCellValue((String) obj);
                    }
                }
            }
            // Hoisted: one tracking call suffices; the original re-invoked
            // trackAllColumnsForAutoSizing for every column.
            sheet.trackAllColumnsForAutoSizing();
            for (int colNum = 0; colNum < cols; colNum++) {
                sheet.autoSizeColumn(colNum);
                sheet.setColumnWidth(colNum, sheet.getColumnWidth(colNum) * 17 / 10);
            }
            excelList.add(workbook);
            log.trace("线程{}创建excel完成,耗时{}", Thread.currentThread().getName(), sw.stop().elapsed(TimeUnit.MILLISECONDS));
        } catch (Exception e) {
            // Fix: null-guard before data.size() — the original would have
            // NPE'd while logging if data were ever null here.
            log.trace("线程名{}data大小{}data是否为空{}", Thread.currentThread().getName(), data == null ? 0 : data.size(), data == null);
            e.printStackTrace();
        } finally {
            if (countDownLatch != null) {
                countDownLatch.countDown();
            }
            // Drop the page data so it can be GC'd; without this the heap grew
            // steadily during large exports (~25% savings observed).
            if (data != null) {
                data.clear();
            }
            data = null;
        }
    }
}
测试情况,500行一个文件
线程pool-12-thread-7创建excel完成,耗时716
线程pool-12-thread-5创建excel完成,耗时733
线程pool-12-thread-6创建excel完成,耗时734
线程pool-12-thread-1创建excel完成,耗时750
线程pool-12-thread-11创建excel完成,耗时756
线程pool-12-thread-9创建excel完成,耗时776
...
excel导出记录5500条,耗时1564
导出总耗时3074
线程pool-12-thread-10创建excel完成,耗时646 ms
线程pool-12-thread-3创建excel完成,耗时648 ms
线程pool-12-thread-6创建excel完成,耗时668 ms
线程pool-12-thread-7创建excel完成,耗时678 ms
线程pool-12-thread-5创建excel完成,耗时682 ms
线程pool-12-thread-9创建excel完成,耗时684 ms
线程pool-12-thread-4创建excel完成,耗时687 ms
线程pool-12-thread-1创建excel完成,耗时707 ms
线程pool-12-thread-2创建excel完成,耗时716 ms
线程pool-12-thread-8创建excel完成,耗时721 ms
excel导出记录5000条,耗时1479 ms
导出总耗时2673 ms
线程pool-13-thread-1创建excel完成,耗时109
excel导出记录500条,耗时181
导出总耗时436
测试情况,5000行一个文件
线程pool-15-thread-2创建excel完成,耗时35
线程pool-15-thread-1创建excel完成,耗时61
excel导出记录5500条,耗时204
导出总耗时1341
线程pool-14-thread-1创建excel完成,耗时56
excel导出记录5000条,耗时137
导出总耗时1103
线程pool-12-thread-1创建excel完成,耗时538
excel导出记录500条,耗时708
导出总耗时999
注意不要图快开大量线程把cpu占满了,这样其他请求这机器就处理不了了,而且当线程数超过当前服务器可用线程数时,线程切换频繁,速度也会变慢。
计算最大线程数方法参考
生成excel文件速度优化后,还存在一个问题:这里用于导出的数据是一次性加载到内存中的,当数据量大的时候会占用大量内存,需改用读一部分写一部分的形式。
优化二
采用了边加载边写excel文件的形式减少内存占用,类似于生产者消费者模型,使用队列进行数据的临时存储及交换,生产者线程不停将要写入的数据从数据库取出,经处理后存入队列中,消费者线程即excel线程管理类启动后不停从队列中取数据,当数据积累够指定数量或者所有要写excel的数据已经从队列中取出时,将这些数据交给写excel线程进行写入。大概流程如下:
消费者类
/**
 * Consumer side of the producer/consumer export pipeline: drains rows from a
 * shared queue, batches them into pages of SINGLE_FILE_SIZE, hands each page
 * to a pageTask writer thread, and finally zips all workbooks into the
 * response via downloadFiles.
 *
 * The producer pushes rows into {@code queue} and calls {@link #setEnd} once
 * all rows have been enqueued.
 */
public static class ExportThread<T extends BaseExcelOutputVo> implements Runnable {

    private LinkedBlockingQueue<T> queue;
    // Fix: volatile — written by the producer thread, read here; without it
    // this thread may never observe setEnd(true) and loop forever.
    private volatile boolean isEnd = false;
    private List<T> list = new CopyOnWriteArrayList<>();
    List<Workbook> excelList;
    HttpServletRequest request;
    HttpServletResponse response;
    String downloadName;
    String fileName;
    ExecutorService executor;

    public ExportThread(LinkedBlockingQueue<T> queue, HttpServletRequest request, HttpServletResponse response, String downloadName, String fileName) {
        this.queue = queue;
        this.request = request;
        this.response = response;
        this.downloadName = downloadName;
        this.fileName = fileName;
    }

    /** Called by the producer once every row has been put on the queue. */
    public void setEnd(boolean end) {
        isEnd = end;
    }

    @Override
    public void run() {
        try {
            excelList = new CopyOnWriteArrayList<>();
            executor = Executors.newFixedThreadPool(nThreads * 2);
            while (!isEnd || queue.size() > 0) {
                while (queue.size() > 0) {
                    // Fix: poll can time out and return null; the original
                    // added the result unconditionally, so a timeout would put
                    // a null row into the page and NPE inside pageTask.
                    T item = queue.poll(60 * 1000, TimeUnit.MILLISECONDS);
                    if (item != null) {
                        list.add(item);
                    }
                    if (list.size() == SINGLE_FILE_SIZE) {
                        break;
                    }
                }
                // Flush a page when it is full, or when this is the final
                // partial page (producer done and queue drained).
                if (list.size() > 0 && (list.size() == SINGLE_FILE_SIZE || (isEnd && queue.size() == 0))) {
                    executor.execute(new pageTask(new CopyOnWriteArrayList(list), excelList));
                    list.clear();
                }
            }
            executor.shutdown();
            // Fix: the original busy-spun on `while(!executor.isTerminated()){}`,
            // pinning a core; awaitTermination blocks instead.
            while (!executor.awaitTermination(1, TimeUnit.SECONDS)) {
                // keep waiting until every pageTask writer has finished
            }
            if (excelList.size() > 0) {
                try {
                    downloadFiles(request, response, downloadName, fileName, excelList);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
            e.printStackTrace();
        }
    }
}
测试情况
文件分页 5000&查询分页 5000
数据量 | 耗时ms |
---|---|
一千 | 1162 |
十万 | 18579 |
一百万 | 611380 |
一百万的数据量时单次查询会越来越慢,是mysql分页查询里面limit偏移量变大的影响。
如果这样优化后导出还是需要十秒以上,为了避免导出数据太多导致页面响应超时,可以采用异步的形式,后端接收到导出请求并检查参数正确后返回成功,再提供一个接口用来查询该次导出是否已完成,完成再返回文件,未完成则需要页面过段时间再次调用查询接口检查。