初始化设置
# common utils
import os
import math
import itertools
import warnings
import numpy as np
import pandas as pd
from collections import Counter
import multiprocessing
import pydicom # 用于读取dcm文件
import glob
import scipy.misc
import functools
# pytorch utils
import torch
import torchvision
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import cv2
from torchvision import models
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from skimage.io import imread
# others
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline
plt.rcParams['figure.figsize'] = (3.5, 2.5)
%config InlineBackend.figure_format = 'svg'
warnings.filterwarnings("ignore")
end = ''
# Root directories (absolute, machine-specific paths).
DataPath = '/media/yuansh/14THHD/mimic_cxr/dataset/'
OutPath = '/media/yuansh/14THHD/mimic_cxr/out_path/'
info_dir = '/media/yuansh/14THHD/1-MIMIC/dataset/mimic-iv-0.4'

# Load the file lists and metadata tables. Each file is read exactly once
# (the export previously contained this whole sequence twice).
dcm_list = list(np.load(OutPath+'dcm_list.npy'))  # paths of all DICOM files
# NOTE(review): replace() rewrites every 'dcm' substring in the path, not only
# the extension -- confirm no directory name contains 'dcm'.
jpg_list = [i.replace('dcm','jpg') for i in dcm_list]  # paths of all jpg files
# Base name of each file with its last four characters removed
# (presumably the '.dcm' extension).
file_id = [os.path.split(i)[-1][:-4] for i in dcm_list]
img_info = pd.read_csv(OutPath + 'Sample_infos.csv',index_col=0)  # image header information
sepsis_info = pd.read_csv(info_dir + '/meta-info/mimic_iv_sepsis3.csv')  # sepsis information
shock_info = pd.read_csv(info_dir + '/meta-info/mimic_iv_septic_shock.csv')  # septic shock information
# Header rows whose PatientID occurs among the sepsis subject_ids. Take an
# explicit copy so the column assignment below modifies an independent frame
# instead of a slice of img_info.
use_imgs = img_info[img_info.PatientID.isin(sepsis_info.subject_id)].copy()
sepsis_imgs = list(np.load(OutPath+'imgs_sepsis.npy'))  # paths of the sepsis images
# Store StudyTime as strings.
use_imgs['StudyTime'] = use_imgs['StudyTime'].astype(str)
dcm_list : Dicom文件的具体路径
jpg_list : jpg文件的具体路径
file_id : dicom或者jpg文件的前缀
img_info : dicom文件的头文件信息
sepsis_info : 脓毒症患者信息
shock_info :休克患者信息的studyid以及休克时间
use_imgs : 脓毒症患者图片
sepsis_imgs:脓毒症患者图片路径
# Report cohort sizes: distinct sepsis subject ids, number of sepsis image
# paths, and distinct PatientIDs among the selected header rows.
n_sepsis_patients = len(set(sepsis_info.subject_id))
print("脓毒症患者数量: ", n_sepsis_patients)
n_sepsis_images = len(sepsis_imgs)
print("脓毒症患者图片数量: ", n_sepsis_images)
n_patients_with_images = len(set(use_imgs.PatientID))
print("有图片的脓毒症患者数量: ", n_patients_with_images)
脓毒症患者数量: 10375
脓毒症患者图片数量: 77677
有图片的脓毒症患者数量: 4929
- 定义函数
# Data augmentation
def DataTransforms(phase=None):
    """Build the torchvision preprocessing pipeline for one dataset split.

    Args:
        phase: 'train' selects the augmenting pipeline (resize to 224x224,
            random rotation within +/-20 degrees, brightness jitter);
            'test' or 'val' selects the deterministic pipeline (resize only).
            Both pipelines end with ToTensor followed by Normalize.

    Returns:
        A ``torchvision.transforms.Compose`` holding the selected steps.

    Raises:
        ValueError: if ``phase`` is anything else, including the default
            ``None`` (which previously failed with UnboundLocalError).
    """
    if phase == 'train':
        steps = [
            torchvision.transforms.Resize(size=(224, 224)),
            torchvision.transforms.RandomRotation(degrees=(-20, +20)),
            # The first positional argument of ColorJitter is brightness.
            torchvision.transforms.ColorJitter(brightness=(1.2, 1.5)),
        ]
    elif phase in ('test', 'val'):
        steps = [torchvision.transforms.Resize(size=(224, 224))]
    else:
        raise ValueError(
            "phase must be 'train', 'test' or 'val', got {!r}".format(phase))

    # Tail shared by every phase.
    # NOTE(review): these look like the usual ImageNet channel mean/std -- confirm.
    steps.extend([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225]),
    ])
    return torchvision.transforms.Compose(steps)
- 数据可视化
# Show the first 8 jpg images in a 2x4 grid, each titled with its index.
# NOTE(review): the export lost indentation; the axis/aspect calls are assumed
# to apply to every panel rather than only to the second row.
f, ax = plt.subplots(2,4, figsize=(12,4))
# ax.flat walks the grid row by row, i.e. panel i is ax[i//4, i%4].
for i, panel in enumerate(ax.flat):
    img = imread(jpg_list[i])
    panel.imshow(img, cmap='gray')
    # Both branches of the former `if i<4 / else` set the same title.
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()
![output_9_0](output_9_0.svg)
# 所有的脓毒症患者
# sample_info.head()
# 脓毒症休克
# sample_info[sample_info.stay_id.isin(shock_info.stay_id)].head()
# 脓毒症且有图片的
# sample_info[sample_info.subject_id.isin(img_info.PatientID)].head()
- 提取数据
脓毒症且有图片的数据
# imgs_index = list(use_imgs.index)
#
# def get_imgs(i):
# if pd.Series(os.path.split(i)[-1][:-4]).isin(imgs_index).all():
# return i
#
# # 多进程处理
# p = multiprocessing.Pool(24)
# imgs_list = p.map(get_imgs, dcm_list)
# p.close() # 记得关闭
# p.join()
# imgs_list = list(filter(None, imgs_list))
# np.save( OutPath+'imgs_sepsis.npy',imgs_list)
- StudyTime
193516.234 代表 19:35:16
44401.953 代表 4:44:01
# Clean up StudyTime from the header table.
# Keep only the part before the '.', e.g. '193516.234' -> '193516'.
check_times = np.array([str(i).split('.')[0] for i in use_imgs.StudyTime])
use_imgs['StudyTime'] = check_times
# Values shorter than six characters (e.g. '44401' for 4:44:01) are left-padded
# with zeros so they can be sliced at fixed positions below.
# NOTE(review): the padded strings live only in check_times; they are not
# written back to use_imgs['StudyTime'] -- confirm that is intended.
check_times = [i.zfill(6) for i in use_imgs.StudyTime]
# True where the padded value is exactly six characters long.
ids = [len(i) ==6 for i in check_times]
secend = [int(i[-2:]) for i in check_times]  # last two characters
minut = [int(i[-4:-2]) for i in check_times]  # the two characters before those
hour = [int(i[:-4]) for i in check_times]  # everything before the last four
# Range check: the notebook output shows 59, 59 and 23 for these maxima.
max(secend)
max(minut)
max(hour)
59
59
23
- 先读取几张图片看一下头文件的信息(猜测)
如果患者为正面体位则 PatientOrientation
== “L”,“R”
如果患者为侧面体位则 PatientOrientation
== “A”,“P”
# python 多进程处理for循环
1. 定义迭代器 for i in ...
2. 将要做的事情封装成函数,最后output成一个值,如果需要多值的output可以设置成字典类型或者tuple类型
3. 使用多进程
初始化迭代器
# One work item per entry of sepsis_imgs: the indices 0 .. len(sepsis_imgs) - 1.
items = list(range(len(sepsis_imgs)))
定义函数
def check_PatientOrientation(i):
    """Return the PatientOrientation header value of the i-th sepsis image.

    Args:
        i: index into the module-level ``sepsis_imgs`` list of file paths.

    Returns:
        ``ds.PatientOrientation`` when the dataset read from that path has
        the attribute, otherwise the placeholder ``["yuansh", "yuansh"]``.
    """
    # Only a header attribute is needed, so stop reading before the pixel data
    # instead of loading the whole image.
    ds = pydicom.dcmread(sepsis_imgs[i], stop_before_pixels=True)
    # Two-element placeholder so callers can always index [0] and [1].
    return getattr(ds, 'PatientOrientation', ["yuansh", "yuansh"])
# Multi-process run: apply check_PatientOrientation to every item using 26
# worker processes. The context manager tears the pool down even when map()
# raises, so no worker processes are left behind.
with multiprocessing.Pool(26) as p:
    check_list_only = p.map(check_PatientOrientation, items)
    # Normal path: shut the workers down gracefully, as before.
    p.close()
    p.join()
# Persist the results so later cells can reload them without re-reading the images.
np.save( OutPath+'PatientOrientation.npy',check_list_only)
# Reload the saved PatientOrientation values.
PatientOrientation = list(np.load(OutPath+'PatientOrientation.npy',allow_pickle=True)) #PatientOrientation
# Split every entry into its first and second component; an entry equal to ''
# contributes the placeholder 'yuansh' to both lists.
orientation_pairs = [
    ('yuansh', 'yuansh') if entry == '' else (entry[0], entry[1])
    for entry in PatientOrientation
]
ids1 = [first for first, _ in orientation_pairs]
ids2 = [second for _, second in orientation_pairs]
# Frequency tables of the second and first components.
Counter(ids2)
Counter(ids1)
Counter({'F': 75512, 'yuansh': 2158, 'R': 1, 'A': 1, 'L': 4, 'FP': 1})
Counter({'A': 9092,
'L': 56863,
'R': 4748,
'P': 4805,
'yuansh': 2158,
'F': 3,
'H': 3,
'LP': 4,
'LA': 1})
# Spot-check 8 randomly drawn images whose first PatientOrientation value is 'A'
# (hypothesis being checked: 'A'/'P' mark lateral views, 'L'/'R' frontal views).
import random
ids = np.where([code == 'A' for code in ids1])[0]  # indices tagged 'A'
# randrange excludes its upper bound; the former randint(0, len(ids)) could
# return len(ids) and index one past the end. Sampling is with replacement.
ids = [ids[random.randrange(len(ids))] for _ in range(8)]
f, ax = plt.subplots(2,4, figsize=(12,4))
# ax.flat walks the grid row by row, i.e. panel i is ax[i//4, i%4].
for i, panel in enumerate(ax.flat):
    img = imread(sepsis_imgs[ids[i]])
    panel.imshow(img, cmap='gray')
    # Both branches of the former `if i<4 / else` set the same title.
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()
![output_19_0](output_19_0.svg)
# Spot-check 8 randomly drawn images whose first PatientOrientation value is 'P'
# (hypothesis being checked: 'A'/'P' mark lateral views, 'L'/'R' frontal views).
import random
ids = np.where([code == 'P' for code in ids1])[0]  # indices tagged 'P'
# randrange excludes its upper bound; the former randint(0, len(ids)) could
# return len(ids) and index one past the end. Sampling is with replacement.
ids = [ids[random.randrange(len(ids))] for _ in range(8)]
f, ax = plt.subplots(2,4, figsize=(12,4))
# ax.flat walks the grid row by row, i.e. panel i is ax[i//4, i%4].
for i, panel in enumerate(ax.flat):
    img = imread(sepsis_imgs[ids[i]])
    panel.imshow(img, cmap='gray')
    # Both branches of the former `if i<4 / else` set the same title.
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()
![output_20_0](output_20_0.svg)
# Spot-check 8 randomly drawn images whose first PatientOrientation value is 'R'
# (hypothesis being checked: 'L'/'R' mark frontal views, 'A'/'P' lateral views).
import random
ids = np.where([code == 'R' for code in ids1])[0]  # indices tagged 'R'
# randrange excludes its upper bound; the former randint(0, len(ids)) could
# return len(ids) and index one past the end. Sampling is with replacement.
ids = [ids[random.randrange(len(ids))] for _ in range(8)]
f, ax = plt.subplots(2,4, figsize=(12,4))
# ax.flat walks the grid row by row, i.e. panel i is ax[i//4, i%4].
for i, panel in enumerate(ax.flat):
    img = imread(sepsis_imgs[ids[i]])
    panel.imshow(img, cmap='gray')
    # Both branches of the former `if i<4 / else` set the same title.
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()
![output_21_0](output_21_0.svg)
# Spot-check 8 randomly drawn images whose first PatientOrientation value is 'L'
# (hypothesis being checked: 'L'/'R' mark frontal views, 'A'/'P' lateral views).
import random
ids = np.where([code == 'L' for code in ids1])[0]  # indices tagged 'L'
# randrange excludes its upper bound; the former randint(0, len(ids)) could
# return len(ids) and index one past the end. Sampling is with replacement.
ids = [ids[random.randrange(len(ids))] for _ in range(8)]
f, ax = plt.subplots(2,4, figsize=(12,4))
# ax.flat walks the grid row by row, i.e. panel i is ax[i//4, i%4].
for i, panel in enumerate(ax.flat):
    img = imread(sepsis_imgs[ids[i]])
    panel.imshow(img, cmap='gray')
    # Both branches of the former `if i<4 / else` set the same title.
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()
![output_22_0](output_22_0.svg)