Rosalind编程问题:统计DNA序列上由ACGT组成的k-mer(k=4)出现的次数。
k-Mer Composition
For a fixed positive integer k, order all possible k-mers taken from an underlying alphabet lexicographically.
Then the k-mer composition of a string s can be represented by an array A for which A[m] denotes the number of times that the mth k-mer (with respect to the lexicographic order) appears in s. Given: A DNA string s in FASTA format (having length at most 100 kbp). Sample dataset:
Rosalind_6431 CTTCGAAAGTTTGGGCCGAGTCTTACAGTCGGTCTTGAAGCAAAGTAACGAACTCCACGG CCCTGACTACCGAACCAGTTGTGAGTACTCAACTGGGTGAGAGTGCAGTCCCTATTGAGT TTCCGAGACTCACCGGGATTTTCGATCCAGCCTCAGTCCAGTCTTGTGGCCAACTCACCA AATGACGTTGGAATATCCCTGTCTAGCTCACGCAGTACTTAGTAAGAGGTCGCTGCAGCG GGGCAAGGAGATCGGAAAATGTGCTCTATATGCGACTAAAGCTCCTAACTTACACGTAGA CTTGCCCGTGTTAAAAACTCGGCTCACATGCTGTCTGCGGCTGGCTGTATACAGTATCTA CCTAATACCCTTCAGTTCGCCGCACAAAAGCTGGGAGTTACCGCGGAAATCACAG
Return: The 4-mer composition of s. Sample output:
4 1 4 3 0 1 1 5 1 3 1 2 2 1 2 0 1 1 3 1 2 1 3 1 1 1 1 2 2 5 1 3 0 2 2 1 1 1 1 3 1 0 0 1 5 5 1 5 0 2 0 2 1 2 1 1 1 2 0 1 0 0 1 1 3 2 1 0 3 2 3 0 0 2 0 8 0 0 1 0 2 1 3 0 0 0 1 4 3 2 1 1 3 1 2 1 3 1 2 1 2 1 1 1 2 3 2 1 1 0 1 1 3 2 1 2 6 2 1 1 1 2 3 3 3 2 3 0 3 2 1 1 0 0 1 4 3 0 1 5 0 2 0 1 2 1 3 0 1 2 2 1 1 0 3 0 0 4 5 0 3 0 2 1 1 3 0 3 2 2 1 1 0 2 1 0 2 2 1 2 0 2 2 5 2 2 1 1 2 1 2 2 2 2 1 1 3 4 0 2 1 1 0 1 2 2 1 1 1 5 2 0 3 2 1 1 2 2 3 0 3 0 1 3 1 2 3 0 2 1 2 2 1 2 3 0 1 2 3 1 1 3 1 0 1 1 3 0 2 1 2 2 0 2 1 1
题目给我们一个序列,我们需要统计序列中每个4-mer出现的次数,并将这些次数以空格分隔输出到屏幕。
解题思路如下:
/**
 * Solution to the Rosalind "k-Mer Composition" problem.
 *
 * <p>Enumerates every 4-mer over the alphabet {@code ACGT} in lexicographic order, counts how
 * often each one occurs (overlapping occurrences included) in the first sequence of a FASTA
 * file, and prints the counts separated by single spaces.
 */
public class kMer_Composition {

    /** Alphabet symbols, listed in the order that defines the lexicographic k-mer order. */
    private static final String ALPHABET = "ACGT";

    /** Length of the k-mers that are counted. */
    private static final int K = 4;

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "C:/Users/Administrator/Desktop/rosalind_kmer.txt";

    /**
     * Reads a FASTA file and prints the 4-mer composition of its first sequence to standard
     * output, one count per k-mer followed by a space.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;

        // 1. Enumerate every k-mer over the alphabet in lexicographic order.
        List<String> Order = new ArrayList<>();
        kmers(ALPHABET, Order, K, 0);

        // 2. Seed an insertion-ordered map so the counts are later printed in the same
        //    order in which the k-mers were generated.
        Map<String, Integer> counts = new LinkedHashMap<>();
        for (String kmer : Order) {
            counts.put(kmer, 0);
        }

        // 3. Load the sequence whose k-mers are to be counted.
        ArrayList<String> fasta = BufferedReader2(path, "fasta");
        if (fasta.isEmpty()) {
            // The reader returns an empty list when the file could not be read at all.
            System.err.println("No sequence could be read from " + path);
            return;
        }
        String dna = fasta.get(0);

        // 4. Slide a window of width K over the sequence; windows that are not a key of the
        //    map (e.g. containing a symbol outside the alphabet) are ignored.
        for (int start = 0; start + K <= dna.length(); start++) {
            String window = dna.substring(start, start + K);
            Integer seen = counts.get(window);
            if (seen != null) {
                counts.put(window, seen + 1);
            }
        }

        // 5. Emit the counts in map order, each followed by a single space.
        StringBuilder out = new StringBuilder();
        for (int value : counts.values()) {
            out.append(value).append(' ');
        }
        System.out.print(out);
    }

    /**
     * Fills {@code Order} with all strings of length {@code n} over {@code alphabet}.
     *
     * <p>Called with an empty list and {@code i == 0}, the list ends up holding every n-mer,
     * ordered lexicographically with respect to the character order of {@code alphabet}.
     * Round 1 appends the single characters; every later round replaces the list contents
     * with each entry extended by each alphabet character, dropping entries that are shorter
     * than the current round number.
     *
     * @param alphabet symbols to build the k-mers from
     * @param Order    list that receives the k-mers; it is modified in place
     * @param n        number of rounds to run up to, i.e. the target k-mer length
     * @param i        number of rounds already completed; pass 0 to start from scratch
     */
    public static void kmers(String alphabet, List<String> Order, int n, int i) {
        while (i < n) {
            i += 1;
            if (i == 1) {
                // First round: seed the list with the single-character strings.
                for (int j = 0; j < alphabet.length(); j++) {
                    Order.add(String.valueOf(alphabet.charAt(j)));
                }
                continue;
            }

            // Later rounds: build the replacement list in one pass instead of appending to
            // the list and then deleting the short entries one by one.
            List<String> next = new ArrayList<>();
            for (String existing : Order) {
                if (existing.length() >= i) {
                    next.add(existing);
                }
            }
            for (String prefix : Order) {
                for (int j = 0; j < alphabet.length(); j++) {
                    String extended = prefix + alphabet.charAt(j);
                    if (extended.length() >= i) {
                        next.add(extended);
                    }
                }
            }
            Order.clear();
            Order.addAll(next);
        }
    }

    /**
     * Reads a FASTA file and returns either its header lines or its sequences.
     *
     * <p>A line starting with {@code >} is a header. All other lines are trimmed and
     * concatenated into the current sequence. A sequence is stored when the next header is
     * met (only if it is non-empty) and once more at end of file (even if empty).
     *
     * <p>An {@link IOException} is reported on standard error and whatever was collected up
     * to that point is returned; the reader is closed in every case.
     *
     * @param path   path of the FASTA file
     * @param choose {@code "tag"} to get the header lines; any other value returns the sequences
     * @return the header lines or the sequences, in file order
     */
    public static ArrayList<String> BufferedReader2(String path, String choose) {
        ArrayList<String> tag = new ArrayList<>();
        ArrayList<String> fasta = new ArrayList<>();

        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith(">")) {
                    tag.add(line);
                    // A new header ends the sequence collected so far.
                    if (sb.length() != 0) {
                        fasta.add(sb.toString());
                        sb.setLength(0);
                    }
                } else {
                    // Trim so stray whitespace cannot end up inside a k-mer window.
                    sb.append(line.trim());
                }
            }
            // Store the sequence that follows the last header.
            fasta.add(sb.toString());
        } catch (IOException e) {
            System.err.println("Failed to read FASTA file '" + path + "': " + e);
        }

        if ("tag".equals(choose)) {
            return tag;
        }
        return fasta;
    }
}