pandas 选取数据时,iloc 和 loc 的用法不一样:iloc 根据整数位置选取(从 0 开始,切片不包含终点),loc 根据行索引标签选取(切片包含终点)。例如下面的 d.loc[0:3] 返回 4 行,而 d.iloc[0:3] 只返回 3 行。
>>> import pandas as pd
>>> import os
>>> os.chdir("D:\\")
>>> d = pd.read_csv("GWAS_water.qassoc", delimiter= "\s+")
>>> d.loc[1:3]
   CHR SNP   BP  NMISS    BETA      SE       R2      T       P
1    1   .  447     44  0.1800  0.1783  0.02369  1.009  0.3185
2    1   .  449     44  0.2785  0.2473  0.02931  1.126  0.2665
3    1   .  452     44  0.1800  0.1783  0.02369  1.009  0.3185
>>> d.loc[0:3]
   CHR SNP   BP  NMISS    BETA      SE       R2      T       P
0    1   .  410     44  0.2157  0.1772  0.03406  1.217  0.2304
1    1   .  447     44  0.1800  0.1783  0.02369  1.009  0.3185
2    1   .  449     44  0.2785  0.2473  0.02931  1.126  0.2665
3    1   .  452     44  0.1800  0.1783  0.02369  1.009  0.3185
>>> d.iloc[0:3]
   CHR SNP   BP  NMISS    BETA      SE       R2      T       P
0    1   .  410     44  0.2157  0.1772  0.03406  1.217  0.2304
1    1   .  447     44  0.1800  0.1783  0.02369  1.009  0.3185
2    1   .  449     44  0.2785  0.2473  0.02931  1.126  0.2665
>>> d.iloc[1:3,2]
1    447
2    449
Name: BP, dtype: int64
>>> d.iloc[0:3,2]
0    410
1    447
2    449
Name: BP, dtype: int64
>>> d.head()
   CHR SNP   BP  NMISS    BETA      SE       R2       T       P
0    1   .  410     44  0.2157  0.1772  0.03406  1.2170  0.2304
1    1   .  447     44  0.1800  0.1783  0.02369  1.0090  0.3185
2    1   .  449     44  0.2785  0.2473  0.02931  1.1260  0.2665
3    1   .  452     44  0.1800  0.1783  0.02369  1.0090  0.3185
4    1   .  462     44  0.2548  0.2744  0.02012  0.9286  0.3584
>>> d.tail(3)
        CHR SNP        BP  NMISS    BETA      SE       R2       T      P
418704   12   .  19345588     44 -0.2207  0.2558  0.01743 -0.8631  0.393
418705   12   .  19345598     44 -0.2207  0.2558  0.01743 -0.8631  0.393
418706   12   .  19345611     44 -0.2207  0.2558  0.01743 -0.8631  0.393
>>> d.describe()
                 CHR            BP     NMISS          BETA            SE  \
count  418707.000000  4.187070e+05  418707.0  4.186820e+05  418682.00000
mean        5.805738  1.442822e+07      44.0 -4.271777e-03       0.21433
std         3.392930  8.933882e+06       0.0  2.330019e-01       0.05190
min         1.000000  4.100000e+02      44.0 -1.610000e+00       0.10130
25%         3.000000  7.345860e+06      44.0 -1.638000e-01       0.17320
50%         5.000000  1.371612e+07      44.0 -1.826000e-16       0.20670
75%         9.000000  2.051322e+07      44.0  1.391000e-01       0.25010
max        12.000000  4.238896e+07      44.0  1.467000e+00       0.67580

                  R2             T             P
count  418682.000000  4.186820e+05  4.186820e+05
mean        0.026268 -1.910774e-02  4.772397e-01
std         0.035903  1.095115e+00  2.944290e-01
min         0.000000 -5.582000e+00  2.034000e-08
25%         0.002969 -7.955000e-01  2.179000e-01
50%         0.012930 -8.468000e-16  4.624000e-01
75%         0.035910  6.712000e-01  7.254000e-01
max         0.531200  6.898000e+00  1.000000e+00
>>> d.sort_values(by="P").iloc[0:15]
        CHR SNP        BP  NMISS    BETA      SE      R2      T             P
42870     1   .  32316680     44  1.1870  0.1721  0.5312  6.898  2.034000e-08
29301     1   .  22184568     44  1.1870  0.1721  0.5312  6.898  2.034000e-08
29302     1   .  22184590     44  1.1870  0.1721  0.5312  6.898  2.034000e-08
29306     1   .  22184654     44  1.1870  0.1721  0.5312  6.898  2.034000e-08
29305     1   .  22184628     44  1.1870  0.1721  0.5312  6.898  2.034000e-08
29304     1   .  22184624     44  1.1870  0.1721  0.5312  6.898  2.034000e-08
112212    3   .  14365699     44  1.4670  0.2255  0.5018  6.504  7.490000e-08
29254     1   .  22167448     44  1.0780  0.1723  0.4822  6.254  1.713000e-07
69291     2   .   9480651     44  1.1140  0.1829  0.4690  6.091  2.939000e-07
29299     1   .  22180991     44  0.8527  0.1458  0.4488  5.848  6.574000e-07
101391    3   .   6959715     44  0.6782  0.1166  0.4462  5.817  7.285000e-07
29333     1   .  22198267     44  0.9252  0.1616  0.4383  5.724  9.888000e-07
195513    5   .  20178388     44  1.0350  0.1817  0.4359  5.697  1.082000e-06
29295     1   .  22180901     44  0.7469  0.1320  0.4324  5.657  1.236000e-06
29300     1   .  22181119     44  0.7469  0.1320  0.4324  5.657  1.236000e-06
>>> sort_D = d.sort_values(by="P").iloc[0:5]
>>> m_D = d.dropna() #remove NA
>>> sort_C = d.sort_values(["P","CHR", "BP"])
>>> sort_C.to_csv(file_name, sep='\t', encoding='utf-8')
>>> d.sort_values(by="C", ascending=True)
>>> sort_D.to_csv("result.txt", sep= " ")
>>> sort_D.to_csv("result_no_index.txt", sep= " ", index=False)
>>>
参考:
# Print the upper triangle of the 9x9 multiplication table: every product
# i * j with 1 <= i <= j <= 9, one value per line, ordered by i and then j.
#
# The inner range starts at i (not 1) so each unordered pair of factors is
# printed exactly once.
for i in range(1, 10):
    for j in range(i, 10):
        # print() with a single argument behaves the same on Python 2 and 3.
        print(i * j)