Utilizo BeautifulSoup para web scraping. Por tanto, necesito analizar esta tabla: Google Finance
Como puede ver, hay "Datos anuales" y "Datos trimestrales". Cuando extraigo la tabla, Python devuelve solo los datos trimestrales, y no tengo idea de cómo extraer los anuales. ¿Alguien sabe cómo hacerlo? A continuación se muestra el código HTML que representa este enlace.

<div class="g-unit g-first">
View:
<a id="interim" class="id-interim nac" target="_blank">Quarterly Data</a>&nbsp;|&nbsp;
<a id="annual" class="id-annual ac" target="_blank">Annual Data</a>
</div>

Aquí está mi código:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Hard-coded sample figures. They are only used to build raw_df below;
# neither raw_data nor raw_df is referenced again in this snippet.
raw_data = {'Param': ['Total Revenue', 'Cost of revenue', 'Gross profit', 
                      'Operating expenses','Research Development'],
             '2016': [123, 234343, 3423, 343, 323],
             '2015': [3432423, 2342, 2342342, 356856, 36934],
             '2014': [42, 52, 36, 24, 73],
             '2013': [42, 52, 36, 24, 73]}

url = 'https://www.google.com/finance?q=NASDAQ%3AAAPL&fstype=ii&ei=JQHoWMjKCcjDsAHAhqS4DA'

# Download the page and parse the response body with the lxml parser.
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
raw_df = pd.DataFrame(raw_data, columns = ['Param', '2016', '2015', '2014','2013'])

# Find all the <tr> tag pairs, skip the first one, then for each row copy the
# stripped text of its first five <td> cells into one list per column.
# NOTE(review): `table` and the lists Revenue, _2016_, _2015_, _2014_ and
# _2013_ are never defined in this snippet, so the loop raises NameError as
# written. `soup` is parsed above but never queried for a table.

for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    # First cell: the row label.
    column_1 = col[0].string.strip()
    Revenue.append(column_1)

    # Remaining four cells: one value per reporting period.
    column_2 = col[1].string.strip()
    _2016_.append(column_2)

    column_3 = col[2].string.strip()
    _2015_.append(column_3)

    column_4 = col[3].string.strip()
    _2014_.append(column_4)

    column_5 = col[4].string.strip()
    _2013_.append(column_5)

# Assemble the per-column lists into a DataFrame keyed by column header.
columns = {'In Millions of USD': Revenue, '52 weeks ending 2016': _2016_, '52 weeks ending 2015': _2015_, '52 weeks ending 2014': _2014_, '52 weeks ending 2013': _2013_}
df = pd.DataFrame(columns)
0
Piskarev Dmitry 8 abr. 2017 a las 01:58

2 respuestas

La mejor respuesta

Su código no funciona tal como está, pero creo entender lo que realmente quiere: extraer (scrape) la tabla de datos anuales a un DataFrame de pandas. Espero que esto ayude.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the annual-data table of the page into a pandas DataFrame with one
# label column and four period columns.
url = 'https://www.google.com/finance?q=NASDAQ%3AAAPL&fstype=ii&ei=JQHoWMjKCcjDsAHAhqS4DA'

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an explicit exception
# instead of a confusing parse failure further down.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')
#print(soup)                                             #to get an idea on the class id

# The div with class "id-incannualdiv" is the container the original author
# identified as holding the annual data report.
table = soup.find("div", {"class": "id-incannualdiv"})
if table is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("annual data container 'id-incannualdiv' not found in the page")

# Flat list of every cell's text, in document order. Cells come in groups of
# five (label + four periods), so a stride of 5 splits them into columns.
rows = [t.text for t in table.find_all("td")]
params = [cell.rstrip("\r\n") for cell in rows[0::5]]   # labels: drop trailing newlines
_2016_ = rows[1::5]
_2015_ = rows[2::5]
_2014_ = rows[3::5]
_2013_ = rows[4::5]

# zip() stops at the shortest column, so a trailing incomplete group of cells
# is dropped rather than producing a ragged frame.
df = pd.DataFrame(
    list(zip(params, _2016_, _2015_, _2014_, _2013_)),
    columns=['In Millions of USD', '52 weeks ending 2016', '52 weeks ending 2015',
             '52 weeks ending 2014', '52 weeks ending 2013'],
)
df.head()

enter image description here

0
âńōŋŷXmoůŜ 9 abr. 2017 a las 20:58

Simplemente use una lista de tablas (`soup.find_all('table')[1:3]`) que incluya las tablas de datos anuales y recórrala tabla por tabla:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Collect the rows of the second and third <table> elements of the page into
# a single pandas DataFrame with one label column and four value columns.

# Hard-coded sample figures, kept from the original snippet. They only feed
# raw_df and play no part in the scraped result.
raw_data = {
    'Param': ['Total Revenue', 'Cost of revenue', 'Gross profit',
              'Operating expenses', 'Research Development'],
    '2016': [123, 234343, 3423, 343, 323],
    '2015': [3432423, 2342, 2342342, 356856, 36934],
    '2014': [42, 52, 36, 24, 73],
    '2013': [42, 52, 36, 24, 73],
}

url = 'https://www.google.com/finance?q=NASDAQ%3AAAPL&fstype=ii&ei=JQHoWMjKCcjDsAHAhqS4DA'

# A timeout avoids hanging forever on a stalled connection; raise_for_status()
# surfaces HTTP errors immediately instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')
raw_df = pd.DataFrame(raw_data, columns=['Param', '2016', '2015', '2014', '2013'])
Revenue, _2016_, _2015_, _2014_, _2013_ = [], [], [], [], []

# Tables at index 1 and 2 of the page (the slice simply yields fewer tables
# if the page has fewer than three).
table = soup.find_all('table')[1:3]
for tab in table:
    # Skip the first <tr> of each table (the header row), then for each row
    # distribute its first five cells over the five column lists.
    for row in tab.find_all('tr')[1:]:
        col = row.find_all('td')
        if len(col) < 5:
            # Rows without five <td> cells cannot fill every column; skip
            # them instead of failing with IndexError.
            continue
        # get_text(strip=True) also works for cells with nested markup, where
        # `.string` would be None and `.string.strip()` would raise.
        for bucket, cell in zip((Revenue, _2016_, _2015_, _2014_, _2013_), col):
            bucket.append(cell.get_text(strip=True))

columns = {'In Millions of USD': Revenue, '52 weeks ending 2016': _2016_, '52 weeks ending 2015': _2015_, '52 weeks ending 2014': _2014_, '52 weeks ending 2013': _2013_}
df = pd.DataFrame(columns)
0
rakesh 9 abr. 2017 a las 21:24